diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 0b2e4e1be86d86..0da482457ad706 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -67,11 +67,11 @@ clang/test/AST/Interp/ @tbaederr /mlir/include/mlir/Dialect/Linalg @dcaballe @nicolasvasilache @rengolin /mlir/lib/Dialect/Linalg @dcaballe @nicolasvasilache @rengolin /mlir/lib/Dialect/Linalg/Transforms/DecomposeLinalgOps.cpp @MaheshRavishankar @nicolasvasilache -/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp @MaheshRavishankar @nicolasvasilache +/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp @dcaballe @MaheshRavishankar @nicolasvasilache /mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @MaheshRavishankar @nicolasvasilache /mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp @hanhanW @nicolasvasilache -/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @hanhanW @nicolasvasilache -/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @hanhanW @nicolasvasilache +/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @dcaballe @hanhanW @nicolasvasilache +/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @banach-space @dcaballe @hanhanW @nicolasvasilache # MemRef Dialect in MLIR. /mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp @MaheshRavishankar @nicolasvasilache @@ -85,10 +85,10 @@ clang/test/AST/Interp/ @tbaederr /mlir/**/*VectorToSCF* @banach-space @dcaballe @matthias-springer @nicolasvasilache /mlir/**/*VectorToLLVM* @banach-space @dcaballe @nicolasvasilache /mlir/**/*X86Vector* @aartbik @dcaballe @nicolasvasilache -/mlir/include/mlir/Dialect/Vector @dcaballe @nicolasvasilache -/mlir/lib/Dialect/Vector @dcaballe @nicolasvasilache -/mlir/lib/Dialect/Vector/Transforms/* @hanhanW @nicolasvasilache -/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp @MaheshRavishankar @nicolasvasilache +/mlir/include/mlir/Dialect/Vector @banach-space @dcaballe @nicolasvasilache +/mlir/lib/Dialect/Vector @banach-space @dcaballe @nicolasvasilache +/mlir/lib/Dialect/Vector/Transforms/* @banach-space @dcaballe @hanhanW @nicolasvasilache +/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp @banach-space @dcaballe @MaheshRavishankar @nicolasvasilache /mlir/**/*EmulateNarrowType* @dcaballe @hanhanW # Presburger library in MLIR diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md index 17c52c65e472fa..bd6e2ec01b53e7 100644 --- a/bolt/docs/CommandLineArgumentReference.md +++ b/bolt/docs/CommandLineArgumentReference.md @@ -283,6 +283,12 @@ List of functions to pad with amount of bytes +- `--print-mappings` + + Print mappings in the legend, between characters/blocks and text sections + (default false). + + - `--profile-format=` Format to dump profile output in aggregation mode, default is fdata @@ -1240,4 +1246,4 @@ - `--print-options` - Print non-default options after command line parsing \ No newline at end of file + Print non-default options after command line parsing diff --git a/bolt/docs/HeatmapHeader.png b/bolt/docs/HeatmapHeader.png new file mode 100644 index 00000000000000..a519dc6215d8cf Binary files /dev/null and b/bolt/docs/HeatmapHeader.png differ diff --git a/bolt/docs/Heatmaps.md b/bolt/docs/Heatmaps.md index 4bae8ed5410df2..bf68232ef7fee9 100644 --- a/bolt/docs/Heatmaps.md +++ b/bolt/docs/Heatmaps.md @@ -1,9 +1,9 @@ # Code Heatmaps BOLT has gained the ability to print code heatmaps based on -sampling-based LBR profiles generated by `perf`. The output is produced -in colored ASCII to be displayed in a color-capable terminal. It looks -something like this: +sampling-based profiles generated by `perf`, either with `LBR` data or not. +The output is produced in colored ASCII to be displayed in a color-capable +terminal. It looks something like this: ![](./Heatmap.png) @@ -32,16 +32,8 @@ $ llvm-bolt-heatmap -p perf.data ``` By default the heatmap will be dumped to *stdout*. You can change it -with `-o ` option. Each character/block in the heatmap -shows the execution data accumulated for corresponding 64 bytes of -code. You can change this granularity with a `-block-size` option. -E.g. set it to 4096 to see code usage grouped by 4K pages. -Other useful options are: +with `-o ` option. -```bash --line-size= - number of entries per line (default 256) --max-address= - maximum address considered valid for heatmap (default 4GB) -``` If you prefer to look at the data in a browser (or would like to share it that way), then you can use an HTML conversion tool. E.g.: @@ -49,3 +41,55 @@ it that way), then you can use an HTML conversion tool. E.g.: ```bash $ aha -b -f > .html ``` + +--- + +## Background on heatmaps: +A heatmap is effectively a histogram that is rendered into a grid for better +visualization. +In theory we can generate a heatmap using any binary and a perf profile. + +Each block/character in the heatmap shows the execution data accumulated for +corresponding 64 bytes of code. You can change this granularity with a +`-block-size` option. +E.g. set it to 4096 to see code usage grouped by 4K pages. + + +When a block is shown as a dot, it means that no samples were found for that +address. +When it is shown as a letter, it indicates a captured sample on a particular +text section of the binary. +To show a mapping between letters and text sections in the legend, use +`-print-mappings`. +When a sampled address does not belong to any of the text sections, the +characters 'o' or 'O' will be shown. + +The legend shows by default the ranges in the heatmap according to the number +of samples per block. +A color is assigned per range, except the first two ranges that distinguished by +lower and upper case letters. + +On the Y axis, each row/line starts with an actual address of the binary. +Consecutive lines in the heatmap advance by the same amount, with the binary +size covered by a line dependent on the block size and the line size. +An empty new line is inserted for larger gaps between samples. + +On the X axis, the horizontally emitted hex numbers can help *estimate* where +in the line the samples lie, but they cannot be combined to provide a full +address, as they are relative to both the bucket and line sizes. + +In the example below, the highlighted `0x100` column is not an offset to each +row's address, but instead, it points to the middle of the line. +For the generation, the default bucket size was used with a line size of 128. + + +![](./HeatmapHeader.png) + + +Some useful options are: + +``` +-line-size= - number of entries per line (default 256) +-max-address= - maximum address considered valid for heatmap (default 4GB) +-print-mappings - print mappings in the legend, between characters/blocks and text sections (default false) +``` diff --git a/bolt/include/bolt/Core/DebugData.h b/bolt/include/bolt/Core/DebugData.h index 2324e577cc7c94..5935ffaa46af7d 100644 --- a/bolt/include/bolt/Core/DebugData.h +++ b/bolt/include/bolt/Core/DebugData.h @@ -256,7 +256,7 @@ class DebugRangeListsSectionWriter : public DebugRangesSectionWriter { }; virtual ~DebugRangeListsSectionWriter(){}; - static void setAddressWriter(DebugAddrWriter *AddrW) { AddrWriter = AddrW; } + void setAddressWriter(DebugAddrWriter *AddrW) { AddrWriter = AddrW; } /// Add ranges with caching. uint64_t addRanges( @@ -284,7 +284,7 @@ class DebugRangeListsSectionWriter : public DebugRangesSectionWriter { } private: - static DebugAddrWriter *AddrWriter; + DebugAddrWriter *AddrWriter = nullptr; /// Used to find unique CU ID. DWARFUnit *CU; /// Current relative offset of range list entry within this CUs rangelist @@ -336,21 +336,36 @@ using AddressSectionBuffer = SmallVector; class DebugAddrWriter { public: DebugAddrWriter() = delete; - DebugAddrWriter(BinaryContext *BC_); + DebugAddrWriter(BinaryContext *BC_) : DebugAddrWriter(BC_, UCHAR_MAX) {}; + DebugAddrWriter(BinaryContext *BC_, uint8_t AddressByteSize); virtual ~DebugAddrWriter(){}; /// Given an address returns an index in .debug_addr. /// Adds Address to map. uint32_t getIndexFromAddress(uint64_t Address, DWARFUnit &CU); /// Write out entries in to .debug_addr section for CUs. - virtual void update(DIEBuilder &DIEBlder, DWARFUnit &CUs); + virtual std::optional finalize(const size_t BufferSize); /// Return buffer with all the entries in .debug_addr already writen out using /// update(...). - virtual AddressSectionBuffer &finalize() { return *Buffer; } + virtual std::unique_ptr releaseBuffer() { + return std::move(Buffer); + } + + /// Returns buffer size. + virtual size_t getBufferSize() const { return Buffer->size(); } + + /// Returns True if Buffer is not empty. + bool isInitialized() const { return !Buffer->empty(); } - /// Returns False if .debug_addr section was created.. - bool isInitialized() const { return !AddressMaps.empty(); } + /// Updates address base with the given Offset. + virtual void updateAddrBase(DIEBuilder &DIEBlder, DWARFUnit &CU, + const uint64_t Offset); + + /// Appends an AddressSectionBuffer to the address writer's buffer. + void appendToAddressBuffer(const AddressSectionBuffer &Buffer) { + *AddressStream << Buffer; + } protected: class AddressForDWOCU { @@ -407,23 +422,32 @@ class DebugAddrWriter { } BinaryContext *BC; - /// Maps DWOID to AddressForDWOCU. - std::unordered_map AddressMaps; + /// Address for the DWO CU associated with the address writer. + AddressForDWOCU Map; + uint8_t AddressByteSize; /// Mutex used for parallel processing of debug info. std::mutex WriterMutex; std::unique_ptr Buffer; std::unique_ptr AddressStream; /// Used to track sections that were not modified so that they can be re-used. - DenseMap UnmodifiedAddressOffsets; + static DenseMap UnmodifiedAddressOffsets; }; class DebugAddrWriterDwarf5 : public DebugAddrWriter { public: DebugAddrWriterDwarf5() = delete; DebugAddrWriterDwarf5(BinaryContext *BC) : DebugAddrWriter(BC) {} + DebugAddrWriterDwarf5(BinaryContext *BC, uint8_t AddressByteSize, + std::optional AddrOffsetSectionBase) + : DebugAddrWriter(BC, AddressByteSize), + AddrOffsetSectionBase(AddrOffsetSectionBase) {} /// Write out entries in to .debug_addr section for CUs. - virtual void update(DIEBuilder &DIEBlder, DWARFUnit &CUs) override; + virtual std::optional finalize(const size_t BufferSize) override; + + /// Updates address base with the given Offset. + virtual void updateAddrBase(DIEBuilder &DIEBlder, DWARFUnit &CU, + const uint64_t Offset) override; protected: /// Given DWARFUnit \p Unit returns either DWO ID or it's offset within @@ -435,6 +459,10 @@ class DebugAddrWriterDwarf5 : public DebugAddrWriter { } return Unit.getOffset(); } + +private: + std::optional AddrOffsetSectionBase = std::nullopt; + static constexpr uint32_t HeaderSize = 8; }; /// This class is NOT thread safe. @@ -583,12 +611,10 @@ class DebugLoclistWriter : public DebugLocWriter { public: ~DebugLoclistWriter() {} DebugLoclistWriter() = delete; - DebugLoclistWriter(DWARFUnit &Unit, uint8_t DV, bool SD) - : DebugLocWriter(DV, LocWriterKind::DebugLoclistWriter), CU(Unit), - IsSplitDwarf(SD) { - assert(DebugLoclistWriter::AddrWriter && - "Please use SetAddressWriter to initialize " - "DebugAddrWriter before instantiation."); + DebugLoclistWriter(DWARFUnit &Unit, uint8_t DV, bool SD, + DebugAddrWriter &AddrW) + : DebugLocWriter(DV, LocWriterKind::DebugLoclistWriter), + AddrWriter(AddrW), CU(Unit), IsSplitDwarf(SD) { if (DwarfVersion >= 5) { LocBodyBuffer = std::make_unique(); LocBodyStream = std::make_unique(*LocBodyBuffer); @@ -600,8 +626,6 @@ class DebugLoclistWriter : public DebugLocWriter { } } - static void setAddressWriter(DebugAddrWriter *AddrW) { AddrWriter = AddrW; } - /// Stores location lists internally to be written out during finalize phase. virtual void addList(DIEBuilder &DIEBldr, DIE &Die, DIEValue &AttrInfo, DebugLocationsVector &LocList) override; @@ -630,7 +654,7 @@ class DebugLoclistWriter : public DebugLocWriter { /// Writes out locations in to a local buffer and applies debug info patches. void finalizeDWARF5(DIEBuilder &DIEBldr, DIE &Die); - static DebugAddrWriter *AddrWriter; + DebugAddrWriter &AddrWriter; DWARFUnit &CU; bool IsSplitDwarf{false}; // Used for DWARF5 to store location lists before being finalized. diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index ab07f07e498455..c916c6f95751fc 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -2041,9 +2041,14 @@ class MCPlusBuilder { return InstructionListType(); } - virtual InstructionListType createDummyReturnFunction(MCContext *Ctx) const { - llvm_unreachable("not implemented"); - return InstructionListType(); + /// Returns a function body that contains only a return instruction. An + /// example usage is a workaround for the '__bolt_fini_trampoline' of + // Instrumentation. + virtual InstructionListType + createReturnInstructionList(MCContext *Ctx) const { + InstructionListType Insts(1); + createReturn(Insts[0]); + return Insts; } /// This method takes an indirect call instruction and splits it up into an diff --git a/bolt/include/bolt/Rewrite/DWARFRewriter.h b/bolt/include/bolt/Rewrite/DWARFRewriter.h index 4f576eaa95576a..abd18b56113b62 100644 --- a/bolt/include/bolt/Rewrite/DWARFRewriter.h +++ b/bolt/include/bolt/Rewrite/DWARFRewriter.h @@ -66,10 +66,6 @@ class DWARFRewriter { /// .debug_aranges DWARF section. std::unique_ptr ARangesSectionWriter; - /// Stores and serializes information that will be put into the - /// .debug_addr DWARF section. - std::unique_ptr AddrWriter; - /// Stores and serializes information that will be put in to the /// .debug_addr DWARF section. /// Does not do de-duplication. @@ -93,6 +89,10 @@ class DWARFRewriter { std::unordered_map> LegacyRangesWritersByCU; + /// Stores address writer for each CU. + std::unordered_map> + AddressWritersByCU; + std::mutex LocListDebugInfoPatchesMutex; /// Dwo id specific its RangesBase. @@ -115,6 +115,7 @@ class DWARFRewriter { void updateUnitDebugInfo(DWARFUnit &Unit, DIEBuilder &DIEBldr, DebugLocWriter &DebugLocWriter, DebugRangesSectionWriter &RangesSectionWriter, + DebugAddrWriter &AddressWriter, std::optional RangesBase = std::nullopt); /// Patches the binary for an object's address ranges to be updated. @@ -141,13 +142,15 @@ class DWARFRewriter { /// Process and write out CUs that are passsed in. void finalizeCompileUnits(DIEBuilder &DIEBlder, DIEStreamer &Streamer, CUOffsetMap &CUMap, - const std::list &CUs); + const std::list &CUs, + DebugAddrWriter &FinalAddrWriter); /// Finalize debug sections in the main binary. void finalizeDebugSections(DIEBuilder &DIEBlder, DWARF5AcceleratorTable &DebugNamesTable, DIEStreamer &Streamer, raw_svector_ostream &ObjOS, - CUOffsetMap &CUMap); + CUOffsetMap &CUMap, + DebugAddrWriter &FinalAddrWriter); /// Patches the binary for DWARF address ranges (e.g. in functions and lexical /// blocks) to be updated. diff --git a/bolt/include/bolt/Utils/CommandLineOpts.h b/bolt/include/bolt/Utils/CommandLineOpts.h index 30e8bd777b3cac..baabeab577fb5e 100644 --- a/bolt/include/bolt/Utils/CommandLineOpts.h +++ b/bolt/include/bolt/Utils/CommandLineOpts.h @@ -40,6 +40,7 @@ extern llvm::cl::opt ExecutionCountThreshold; extern llvm::cl::opt HeatmapBlock; extern llvm::cl::opt HeatmapMaxAddress; extern llvm::cl::opt HeatmapMinAddress; +extern llvm::cl::opt HeatmapPrintMappings; extern llvm::cl::opt HotData; extern llvm::cl::opt HotFunctionsAtEnd; extern llvm::cl::opt HotText; diff --git a/bolt/lib/Core/DIEBuilder.cpp b/bolt/lib/Core/DIEBuilder.cpp index 6633eaa9574216..7815a305c05182 100644 --- a/bolt/lib/Core/DIEBuilder.cpp +++ b/bolt/lib/Core/DIEBuilder.cpp @@ -556,7 +556,17 @@ DWARFDie DIEBuilder::resolveDIEReference( const DWARFAbbreviationDeclaration::AttributeSpec AttrSpec, DWARFUnit *&RefCU, DWARFDebugInfoEntry &DwarfDebugInfoEntry) { assert(RefValue.isFormClass(DWARFFormValue::FC_Reference)); - uint64_t RefOffset = *RefValue.getAsReference(); + uint64_t RefOffset; + if (std::optional Off = RefValue.getAsRelativeReference()) { + RefOffset = RefValue.getUnit()->getOffset() + *Off; + } else if (Off = RefValue.getAsDebugInfoReference(); Off) { + RefOffset = *Off; + } else { + BC.errs() + << "BOLT-WARNING: [internal-dwarf-error]: unsupported reference type: " + << FormEncodingString(RefValue.getForm()) << ".\n"; + return DWARFDie(); + } return resolveDIEReference(AttrSpec, RefOffset, RefCU, DwarfDebugInfoEntry); } @@ -607,7 +617,13 @@ void DIEBuilder::cloneDieReferenceAttribute( DIE &Die, const DWARFUnit &U, const DWARFDie &InputDIE, const DWARFAbbreviationDeclaration::AttributeSpec AttrSpec, const DWARFFormValue &Val) { - const uint64_t Ref = *Val.getAsReference(); + uint64_t Ref; + if (std::optional Off = Val.getAsRelativeReference()) + Ref = Val.getUnit()->getOffset() + *Off; + else if (Off = Val.getAsDebugInfoReference(); Off) + Ref = *Off; + else + return; DIE *NewRefDie = nullptr; DWARFUnit *RefUnit = nullptr; diff --git a/bolt/lib/Core/DebugData.cpp b/bolt/lib/Core/DebugData.cpp index 76c3545c63088c..002f58c4743466 100644 --- a/bolt/lib/Core/DebugData.cpp +++ b/bolt/lib/Core/DebugData.cpp @@ -184,8 +184,6 @@ void DebugRangesSectionWriter::appendToRangeBuffer( *RangesStream << CUBuffer; } -DebugAddrWriter *DebugRangeListsSectionWriter::AddrWriter = nullptr; - uint64_t DebugRangeListsSectionWriter::addRanges( DebugAddressRangesVector &&Ranges, std::map &CachedRanges) { @@ -390,7 +388,9 @@ void DebugARangesSectionWriter::writeARangesSection( } } -DebugAddrWriter::DebugAddrWriter(BinaryContext *BC) : BC(BC) { +DebugAddrWriter::DebugAddrWriter(BinaryContext *BC, + const uint8_t AddressByteSize) + : BC(BC), AddressByteSize(AddressByteSize) { Buffer = std::make_unique(); AddressStream = std::make_unique(*Buffer); } @@ -405,11 +405,6 @@ void DebugAddrWriter::AddressForDWOCU::dump() { } uint32_t DebugAddrWriter::getIndexFromAddress(uint64_t Address, DWARFUnit &CU) { std::lock_guard Lock(WriterMutex); - const uint64_t CUID = getCUID(CU); - if (!AddressMaps.count(CUID)) - AddressMaps[CUID] = AddressForDWOCU(); - - AddressForDWOCU &Map = AddressMaps[CUID]; auto Entry = Map.find(Address); if (Entry == Map.end()) { auto Index = Map.getNextIndex(); @@ -449,29 +444,23 @@ static void updateAddressBase(DIEBuilder &DIEBlder, DebugAddrWriter &AddrWriter, } } -void DebugAddrWriter::update(DIEBuilder &DIEBlder, DWARFUnit &CU) { - // Handling the case where debug information is a mix of Debug fission and - // monolithic. - if (!CU.getDWOId()) - return; - const uint64_t CUID = getCUID(CU); - auto AM = AddressMaps.find(CUID); - // Adding to map even if it did not contribute to .debug_addr. - // The Skeleton CU might still have DW_AT_GNU_addr_base. - uint64_t Offset = Buffer->size(); - // If does not exist this CUs DWO section didn't contribute to .debug_addr. - if (AM == AddressMaps.end()) - return; - std::vector SortedMap(AM->second.indexToAddressBegin(), - AM->second.indexToAdddessEnd()); +void DebugAddrWriter::updateAddrBase(DIEBuilder &DIEBlder, DWARFUnit &CU, + const uint64_t Offset) { + updateAddressBase(DIEBlder, *this, CU, Offset); +} + +std::optional DebugAddrWriter::finalize(const size_t BufferSize) { + if (Map.begin() == Map.end()) + return std::nullopt; + std::vector SortedMap(Map.indexToAddressBegin(), + Map.indexToAdddessEnd()); // Sorting address in increasing order of indices. llvm::sort(SortedMap, llvm::less_first()); - uint8_t AddrSize = CU.getAddressByteSize(); uint32_t Counter = 0; auto WriteAddress = [&](uint64_t Address) -> void { ++Counter; - switch (AddrSize) { + switch (AddressByteSize) { default: assert(false && "Address Size is invalid."); break; @@ -490,10 +479,19 @@ void DebugAddrWriter::update(DIEBuilder &DIEBlder, DWARFUnit &CU) { WriteAddress(0); WriteAddress(Val.second); } - updateAddressBase(DIEBlder, *this, CU, Offset); + return std::nullopt; +} + +void DebugAddrWriterDwarf5::updateAddrBase(DIEBuilder &DIEBlder, DWARFUnit &CU, + const uint64_t Offset) { + /// Header for DWARF5 has size 8, so we add it to the offset. + updateAddressBase(DIEBlder, *this, CU, Offset + HeaderSize); } -void DebugAddrWriterDwarf5::update(DIEBuilder &DIEBlder, DWARFUnit &CU) { +DenseMap DebugAddrWriter::UnmodifiedAddressOffsets; + +std::optional +DebugAddrWriterDwarf5::finalize(const size_t BufferSize) { // Need to layout all sections within .debug_addr // Within each section sort Address by index. const endianness Endian = BC->DwCtx->isLittleEndian() @@ -504,55 +502,44 @@ void DebugAddrWriterDwarf5::update(DIEBuilder &DIEBlder, DWARFUnit &CU) { Endian == llvm::endianness::little, 0); DWARFDebugAddrTable AddrTable; DIDumpOptions DumpOpts; - constexpr uint32_t HeaderSize = 8; - const uint64_t CUID = getCUID(CU); - const uint8_t AddrSize = CU.getAddressByteSize(); - auto AMIter = AddressMaps.find(CUID); // A case where CU has entry in .debug_addr, but we don't modify addresses // for it. - if (AMIter == AddressMaps.end()) { - AMIter = AddressMaps.insert({CUID, AddressForDWOCU()}).first; - std::optional BaseOffset = CU.getAddrOffsetSectionBase(); - if (!BaseOffset) - return; + if (Map.begin() == Map.end()) { + if (!AddrOffsetSectionBase) + return std::nullopt; // Address base offset is to the first entry. // The size of header is 8 bytes. - uint64_t Offset = *BaseOffset - HeaderSize; + uint64_t Offset = *AddrOffsetSectionBase - HeaderSize; auto Iter = UnmodifiedAddressOffsets.find(Offset); - if (Iter != UnmodifiedAddressOffsets.end()) { - updateAddressBase(DIEBlder, *this, CU, Iter->getSecond()); - return; - } - UnmodifiedAddressOffsets[Offset] = Buffer->size() + HeaderSize; - if (Error Err = AddrTable.extract(AddrData, &Offset, 5, AddrSize, + if (Iter != UnmodifiedAddressOffsets.end()) + return Iter->second; + UnmodifiedAddressOffsets[Offset] = BufferSize; + if (Error Err = AddrTable.extract(AddrData, &Offset, 5, AddressByteSize, DumpOpts.WarningHandler)) { DumpOpts.RecoverableErrorHandler(std::move(Err)); - return; + return std::nullopt; } - uint32_t Index = 0; for (uint64_t Addr : AddrTable.getAddressEntries()) - AMIter->second.insert(Addr, Index++); + Map.insert(Addr, Index++); } - updateAddressBase(DIEBlder, *this, CU, Buffer->size() + HeaderSize); - - std::vector SortedMap(AMIter->second.indexToAddressBegin(), - AMIter->second.indexToAdddessEnd()); + std::vector SortedMap(Map.indexToAddressBegin(), + Map.indexToAdddessEnd()); // Sorting address in increasing order of indices. llvm::sort(SortedMap, llvm::less_first()); // Writing out Header - const uint32_t Length = SortedMap.size() * AddrSize + 4; + const uint32_t Length = SortedMap.size() * AddressByteSize + 4; support::endian::write(*AddressStream, Length, Endian); support::endian::write(*AddressStream, static_cast(5), Endian); - support::endian::write(*AddressStream, static_cast(AddrSize), + support::endian::write(*AddressStream, static_cast(AddressByteSize), Endian); support::endian::write(*AddressStream, static_cast(0), Endian); uint32_t Counter = 0; auto writeAddress = [&](uint64_t Address) -> void { ++Counter; - switch (AddrSize) { + switch (AddressByteSize) { default: llvm_unreachable("Address Size is invalid."); break; @@ -571,6 +558,7 @@ void DebugAddrWriterDwarf5::update(DIEBuilder &DIEBlder, DWARFUnit &CU) { writeAddress(0); writeAddress(Val.second); } + return std::nullopt; } void DebugLocWriter::init() { @@ -723,11 +711,11 @@ void DebugLoclistWriter::addList(DIEBuilder &DIEBldr, DIE &Die, DIEValue &AttrInfo, DebugLocationsVector &LocList) { if (DwarfVersion < 5) - writeLegacyLocList(AttrInfo, LocList, DIEBldr, Die, *AddrWriter, *LocBuffer, + writeLegacyLocList(AttrInfo, LocList, DIEBldr, Die, AddrWriter, *LocBuffer, CU, *LocStream); else writeDWARF5LocList(NumberOfEntries, AttrInfo, LocList, Die, DIEBldr, - *AddrWriter, *LocBodyBuffer, RelativeLocListOffsets, CU, + AddrWriter, *LocBodyBuffer, RelativeLocListOffsets, CU, *LocBodyStream); } @@ -789,8 +777,6 @@ void DebugLoclistWriter::finalize(DIEBuilder &DIEBldr, DIE &Die) { finalizeDWARF5(DIEBldr, Die); } -DebugAddrWriter *DebugLoclistWriter::AddrWriter = nullptr; - static std::string encodeLE(size_t ByteSize, uint64_t NewValue) { std::string LE64(ByteSize, 0); for (size_t I = 0; I < ByteSize; ++I) { diff --git a/bolt/lib/Passes/Instrumentation.cpp b/bolt/lib/Passes/Instrumentation.cpp index e824a42d826964..ebb3925749b4d2 100644 --- a/bolt/lib/Passes/Instrumentation.cpp +++ b/bolt/lib/Passes/Instrumentation.cpp @@ -754,7 +754,7 @@ void Instrumentation::createAuxiliaryFunctions(BinaryContext &BC) { // with unknown symbol in runtime library. E.g. for static PIE // executable createSimpleFunction("__bolt_fini_trampoline", - BC.MIB->createDummyReturnFunction(BC.Ctx.get())); + BC.MIB->createReturnInstructionList(BC.Ctx.get())); } } } diff --git a/bolt/lib/Profile/Heatmap.cpp b/bolt/lib/Profile/Heatmap.cpp index 210a5cc98c1041..5fc3e0669352da 100644 --- a/bolt/lib/Profile/Heatmap.cpp +++ b/bolt/lib/Profile/Heatmap.cpp @@ -13,6 +13,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Format.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include @@ -164,6 +165,7 @@ void Heatmap::print(raw_ostream &OS) const { // Print map legend OS << "Legend:\n"; + OS << "\nRanges:\n"; uint64_t PrevValue = 0; for (unsigned I = 0; I < sizeof(Range) / sizeof(Range[0]); ++I) { const uint64_t Value = Range[I]; @@ -172,6 +174,22 @@ void Heatmap::print(raw_ostream &OS) const { OS << " : (" << PrevValue << ", " << Value << "]\n"; PrevValue = Value; } + if (opts::HeatmapPrintMappings) { + OS << "\nSections:\n"; + unsigned SectionIdx = 0; + for (auto TxtSeg : TextSections) { + const char Upper = static_cast('A' + ((SectionIdx++) % 26)); + const char Lower = static_cast(std::tolower(Upper)); + OS << formatv(" {0}/{1} : {2,-10} ", Lower, Upper, TxtSeg.Name); + if (MaxAddress > 0xffffffff) + OS << format("0x%016" PRIx64, TxtSeg.BeginAddress) << "-" + << format("0x%016" PRIx64, TxtSeg.EndAddress) << "\n"; + else + OS << format("0x%08" PRIx64, TxtSeg.BeginAddress) << "-" + << format("0x%08" PRIx64, TxtSeg.EndAddress) << "\n"; + } + OS << "\n"; + } // Pos - character position from right in hex form. auto printHeader = [&](unsigned Pos) { diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp index 89168b4bd559cc..042c39a574561a 100644 --- a/bolt/lib/Rewrite/DWARFRewriter.cpp +++ b/bolt/lib/Rewrite/DWARFRewriter.cpp @@ -612,12 +612,15 @@ void DWARFRewriter::updateDebugInfo() { errs() << "BOLT-WARNING: --deterministic-debuginfo is being deprecated\n"; } + /// Stores and serializes information that will be put into the + /// .debug_addr DWARF section. + std::unique_ptr FinalAddrWriter; + if (BC.isDWARF5Used()) { - AddrWriter = std::make_unique(&BC); + FinalAddrWriter = std::make_unique(&BC); RangeListsSectionWriter = std::make_unique(); - DebugRangeListsSectionWriter::setAddressWriter(AddrWriter.get()); } else { - AddrWriter = std::make_unique(&BC); + FinalAddrWriter = std::make_unique(&BC); } if (BC.isDWARFLegacyUsed()) { @@ -625,34 +628,42 @@ void DWARFRewriter::updateDebugInfo() { LegacyRangesSectionWriter->initSection(); } - DebugLoclistWriter::setAddressWriter(AddrWriter.get()); - uint32_t CUIndex = 0; std::mutex AccessMutex; // Needs to be invoked in the same order as CUs are processed. - auto createRangeLocList = [&](DWARFUnit &CU) -> DebugLocWriter * { + auto createRangeLocListAddressWriters = + [&](DWARFUnit &CU) -> DebugLocWriter * { std::lock_guard Lock(AccessMutex); const uint16_t DwarfVersion = CU.getVersion(); if (DwarfVersion >= 5) { + auto AddrW = std::make_unique( + &BC, CU.getAddressByteSize(), CU.getAddrOffsetSectionBase()); + RangeListsSectionWriter->setAddressWriter(AddrW.get()); LocListWritersByCU[CUIndex] = - std::make_unique(CU, DwarfVersion, false); + std::make_unique(CU, DwarfVersion, false, *AddrW); if (std::optional DWOId = CU.getDWOId()) { assert(RangeListsWritersByCU.count(*DWOId) == 0 && "RangeLists writer for DWO unit already exists."); - auto RangeListsSectionWriter = + auto DWORangeListsSectionWriter = std::make_unique(); - RangeListsSectionWriter->initSection(CU); - RangeListsWritersByCU[*DWOId] = std::move(RangeListsSectionWriter); + DWORangeListsSectionWriter->initSection(CU); + DWORangeListsSectionWriter->setAddressWriter(AddrW.get()); + RangeListsWritersByCU[*DWOId] = std::move(DWORangeListsSectionWriter); } + AddressWritersByCU[CU.getOffset()] = std::move(AddrW); } else { + auto AddrW = + std::make_unique(&BC, CU.getAddressByteSize()); + AddressWritersByCU[CU.getOffset()] = std::move(AddrW); LocListWritersByCU[CUIndex] = std::make_unique(); if (std::optional DWOId = CU.getDWOId()) { assert(LegacyRangesWritersByCU.count(*DWOId) == 0 && "LegacyRangeLists writer for DWO unit already exists."); auto LegacyRangesSectionWriterByCU = std::make_unique(); + LegacyRangesSectionWriterByCU->initSection(CU); LegacyRangesWritersByCU[*DWOId] = std::move(LegacyRangesSectionWriterByCU); } @@ -674,10 +685,12 @@ void DWARFRewriter::updateDebugInfo() { std::optional DWOId = Unit->getDWOId(); if (DWOId) SplitCU = BC.getDWOCU(*DWOId); - DebugLocWriter *DebugLocWriter = createRangeLocList(*Unit); + DebugLocWriter *DebugLocWriter = createRangeLocListAddressWriters(*Unit); DebugRangesSectionWriter *RangesSectionWriter = Unit->getVersion() >= 5 ? RangeListsSectionWriter.get() : LegacyRangesSectionWriter.get(); + DebugAddrWriter *AddressWriter = + AddressWritersByCU[Unit->getOffset()].get(); // Skipping CUs that failed to load. if (SplitCU) { DIEBuilder DWODIEBuilder(BC, &(*SplitCU)->getContext(), DebugNamesTable, @@ -698,7 +711,8 @@ void DWARFRewriter::updateDebugInfo() { DWODIEBuilder.updateDWONameCompDirForTypes(DWOStrOffstsWriter, DWOStrWriter, **SplitCU, DwarfOutputPath, DWOName); - DebugLoclistWriter DebugLocDWoWriter(*Unit, Unit->getVersion(), true); + DebugLoclistWriter DebugLocDWoWriter(*Unit, Unit->getVersion(), true, + *AddressWriter); DebugRangesSectionWriter *TempRangesSectionWriter = RangesSectionWriter; if (Unit->getVersion() >= 5) { TempRangesSectionWriter = RangeListsWritersByCU[*DWOId].get(); @@ -709,7 +723,7 @@ void DWARFRewriter::updateDebugInfo() { } updateUnitDebugInfo(*(*SplitCU), DWODIEBuilder, DebugLocDWoWriter, - *TempRangesSectionWriter); + *TempRangesSectionWriter, *AddressWriter); DebugLocDWoWriter.finalize(DWODIEBuilder, *DWODIEBuilder.getUnitDIEbyUnit(**SplitCU)); if (Unit->getVersion() >= 5) @@ -728,11 +742,10 @@ void DWARFRewriter::updateDebugInfo() { } updateUnitDebugInfo(*Unit, *DIEBlder, *DebugLocWriter, *RangesSectionWriter, - RangesBase); + *AddressWriter, RangesBase); DebugLocWriter->finalize(*DIEBlder, *DIEBlder->getUnitDIEbyUnit(*Unit)); if (Unit->getVersion() >= 5) RangesSectionWriter->finalizeSection(); - AddrWriter->update(*DIEBlder, *Unit); }; DIEBuilder DIEBlder(BC, BC.DwCtx.get(), DebugNamesTable); @@ -758,7 +771,7 @@ void DWARFRewriter::updateDebugInfo() { for (DWARFUnit *CU : DIEBlder.getProcessedCUs()) processUnitDIE(CU, &DIEBlder); finalizeCompileUnits(DIEBlder, *Streamer, OffsetMap, - DIEBlder.getProcessedCUs()); + DIEBlder.getProcessedCUs(), *FinalAddrWriter); } } else { // Update unit debug info in parallel @@ -773,8 +786,8 @@ void DWARFRewriter::updateDebugInfo() { if (opts::WriteDWP) finalizeDWP(State); - finalizeDebugSections(DIEBlder, DebugNamesTable, *Streamer, *ObjOS, - OffsetMap); + finalizeDebugSections(DIEBlder, DebugNamesTable, *Streamer, *ObjOS, OffsetMap, + *FinalAddrWriter); GDBIndexSection.updateGdbIndexSection(OffsetMap, CUIndex, *ARangesSectionWriter); } @@ -782,7 +795,7 @@ void DWARFRewriter::updateDebugInfo() { void DWARFRewriter::updateUnitDebugInfo( DWARFUnit &Unit, DIEBuilder &DIEBldr, DebugLocWriter &DebugLocWriter, DebugRangesSectionWriter &RangesSectionWriter, - std::optional RangesBase) { + DebugAddrWriter &AddressWriter, std::optional RangesBase) { // Cache debug ranges so that the offset for identical ranges could be reused. std::map CachedRanges; @@ -816,7 +829,7 @@ void DWARFRewriter::updateUnitDebugInfo( if (FormLowPC == dwarf::DW_FORM_addrx || FormLowPC == dwarf::DW_FORM_GNU_addr_index) - LowPC = AddrWriter->getIndexFromAddress(LowPC, Unit); + LowPC = AddressWriter.getIndexFromAddress(LowPC, Unit); if (LowPCVal) DIEBldr.replaceValue(Die, AttrLowPC, FormLowPC, DIEInteger(LowPC)); @@ -980,7 +993,7 @@ void DWARFRewriter::updateUnitDebugInfo( if (AttrVal.getForm() == dwarf::DW_FORM_addrx) { const uint32_t Index = - AddrWriter->getIndexFromAddress(UpdatedAddress, Unit); + AddressWriter.getIndexFromAddress(UpdatedAddress, Unit); DIEBldr.replaceValue(Die, AttrVal.getAttribute(), AttrVal.getForm(), DIEInteger(Index)); } else if (AttrVal.getForm() == dwarf::DW_FORM_addr) { @@ -1197,7 +1210,7 @@ void DWARFRewriter::updateUnitDebugInfo( assert(EntryAddress && "Address is not found."); assert(Index <= std::numeric_limits::max() && "Invalid Operand Index."); - const uint32_t AddrIndex = AddrWriter->getIndexFromAddress( + const uint32_t AddrIndex = AddressWriter.getIndexFromAddress( EntryAddress->Address, Unit); // update Index into .debug_address section for DW_AT_location. // The Size field is not stored in IR, we need to minus 1 in @@ -1249,7 +1262,7 @@ void DWARFRewriter::updateUnitDebugInfo( std::lock_guard Lock(DWARFRewriterMutex); if (Form == dwarf::DW_FORM_addrx || Form == dwarf::DW_FORM_GNU_addr_index) { - const uint32_t Index = AddrWriter->getIndexFromAddress( + const uint32_t Index = AddressWriter.getIndexFromAddress( NewAddress ? NewAddress : Address, Unit); DIEBldr.replaceValue(Die, LowPCAttrInfo.getAttribute(), LowPCAttrInfo.getForm(), DIEInteger(Index)); @@ -1512,7 +1525,8 @@ CUOffsetMap DWARFRewriter::finalizeTypeSections(DIEBuilder &DIEBlder, void DWARFRewriter::finalizeDebugSections( DIEBuilder &DIEBlder, DWARF5AcceleratorTable &DebugNamesTable, - DIEStreamer &Streamer, raw_svector_ostream &ObjOS, CUOffsetMap &CUMap) { + DIEStreamer &Streamer, raw_svector_ostream &ObjOS, CUOffsetMap &CUMap, + DebugAddrWriter &FinalAddrWriter) { if (StrWriter->isInitialized()) { RewriteInstance::addToDebugSectionsToOverwrite(".debug_str"); std::unique_ptr DebugStrSectionContents = @@ -1565,13 +1579,12 @@ void DWARFRewriter::finalizeDebugSections( LocationListSectionContents->size()); } - // AddrWriter should be finalized after debug_loc since more addresses can be - // added there. - if (AddrWriter->isInitialized()) { - AddressSectionBuffer AddressSectionContents = AddrWriter->finalize(); + if (FinalAddrWriter.isInitialized()) { + std::unique_ptr AddressSectionContents = + FinalAddrWriter.releaseBuffer(); BC.registerOrUpdateNoteSection(".debug_addr", - copyByteArray(AddressSectionContents), - AddressSectionContents.size()); + copyByteArray(*AddressSectionContents), + AddressSectionContents->size()); } Streamer.emitAbbrevs(DIEBlder.getAbbrevs(), BC.DwCtx->getMaxVersion()); @@ -1624,8 +1637,26 @@ void DWARFRewriter::finalizeDebugSections( void DWARFRewriter::finalizeCompileUnits(DIEBuilder &DIEBlder, DIEStreamer &Streamer, CUOffsetMap &CUMap, - const std::list &CUs) { + const std::list &CUs, + DebugAddrWriter &FinalAddrWriter) { for (DWARFUnit *CU : CUs) { + auto AddressWriterIterator = AddressWritersByCU.find(CU->getOffset()); + assert(AddressWriterIterator != AddressWritersByCU.end() && + "AddressWriter does not exist for CU"); + DebugAddrWriter *AddressWriter = AddressWriterIterator->second.get(); + const size_t BufferOffset = FinalAddrWriter.getBufferSize(); + std::optional Offset = AddressWriter->finalize(BufferOffset); + /// If Offset already exists in UnmodifiedAddressOffsets, then update with + /// Offset, else update with BufferOffset. + if (Offset) + AddressWriter->updateAddrBase(DIEBlder, *CU, *Offset); + else if (AddressWriter->isInitialized()) + AddressWriter->updateAddrBase(DIEBlder, *CU, BufferOffset); + if (AddressWriter->isInitialized()) { + std::unique_ptr AddressSectionContents = + AddressWriter->releaseBuffer(); + FinalAddrWriter.appendToAddressBuffer(*AddressSectionContents); + } if (CU->getVersion() != 4) continue; std::optional DWOId = CU->getDWOId(); @@ -2155,6 +2186,10 @@ void DWARFRewriter::convertToRangesPatchDebugInfo( // when it's absent. if (IsUnitDie) { if (LowForm == dwarf::DW_FORM_addrx) { + auto AddrWriterIterator = AddressWritersByCU.find(Unit.getOffset()); + assert(AddrWriterIterator != AddressWritersByCU.end() && + "AddressWriter does not exist for CU"); + DebugAddrWriter *AddrWriter = AddrWriterIterator->second.get(); const uint32_t Index = AddrWriter->getIndexFromAddress(0, Unit); DIEBldr.replaceValue(&Die, LowPCAttrInfo.getAttribute(), LowPCAttrInfo.getForm(), DIEInteger(Index)); diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index 37136f4a5c5518..e46c42533031cd 100644 --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -3241,12 +3241,6 @@ class X86MCPlusBuilder : public MCPlusBuilder { return Insts; } - InstructionListType createDummyReturnFunction(MCContext *Ctx) const override { - InstructionListType Insts(1); - createReturn(Insts[0]); - return Insts; - } - BlocksVectorTy indirectCallPromotion( const MCInst &CallInst, const std::vector> &Targets, diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp index b9bc79f408a6b2..47375abb2ad3b4 100644 --- a/bolt/lib/Utils/CommandLineOpts.cpp +++ b/bolt/lib/Utils/CommandLineOpts.cpp @@ -105,6 +105,12 @@ cl::opt HeatmapMinAddress( cl::desc("minimum address considered valid for heatmap (default 0)"), cl::Optional, cl::cat(HeatmapCategory)); +cl::opt HeatmapPrintMappings( + "print-mappings", cl::init(false), + cl::desc("print mappings in the legend, between characters/blocks and text " + "sections (default false)"), + cl::Optional, cl::cat(HeatmapCategory)); + cl::opt HotData("hot-data", cl::desc("hot data symbols support (relocation mode)"), cl::cat(BoltCategory)); diff --git a/bolt/test/AArch64/dummy-return.s b/bolt/test/AArch64/dummy-return.s new file mode 100644 index 00000000000000..a4463431617301 --- /dev/null +++ b/bolt/test/AArch64/dummy-return.s @@ -0,0 +1,28 @@ +# REQUIRES: system-linux,target=aarch64{{.*}} + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -static +# RUN: llvm-bolt -instrument -instrumentation-sleep-time=1 %t.exe \ +# RUN: -o %t.instr 2>&1 | FileCheck %s +# RUN: llvm-objdump --disassemble-symbols=__bolt_fini_trampoline %t.instr -D \ +# RUN: | FileCheck %s -check-prefix=CHECK-ASM + +# CHECK: BOLT-INFO: output linked against instrumentation runtime library +# CHECK-ASM: <__bolt_fini_trampoline>: +# CHECK-ASM-NEXT: ret + + .text + .align 4 + .global _start + .type _start, %function +_start: + bl foo + ret + .size _start, .-_start + + .global foo + .type foo, %function +foo: + mov w0, wzr + ret + .size foo, .-foo diff --git a/bolt/test/X86/dwarf5-addr-section-reuse.s b/bolt/test/X86/dwarf5-addr-section-reuse.s index 6b00ce0fdf8059..cf511d6d111e07 100644 --- a/bolt/test/X86/dwarf5-addr-section-reuse.s +++ b/bolt/test/X86/dwarf5-addr-section-reuse.s @@ -1,7 +1,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-main-addr-section-reuse.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-helper1-addr-section-reuse.s -o %thelper1.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-helper2-addr-section-reuse.s -o %thelper2.o -# RUN: %clang %cflags -dwarf-5 %tmain.o %thelper1.o %thelper2.o -o %t.exe -Wl,-q +# RUN: %clang %cflags -dwarf-5 %thelper1.o %tmain.o %thelper2.o -o %t.exe -Wl,-q # RUN: llvm-dwarfdump --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections # RUN: llvm-dwarfdump --debug-info %t.exe.bolt | FileCheck --check-prefix=POSTCHECK %s @@ -14,5 +14,5 @@ # PRECHECK: DW_AT_addr_base (0x00000008) # POSTCHECK: DW_AT_addr_base (0x00000008) -# POSTCHECK: DW_AT_addr_base (0x00000020) -# POSTCHECK: DW_AT_addr_base (0x00000020) +# POSTCHECK: DW_AT_addr_base (0x00000018) +# POSTCHECK: DW_AT_addr_base (0x00000008) diff --git a/bolt/test/X86/match-functions-with-calls-as-anchors.test b/bolt/test/X86/match-functions-with-calls-as-anchors.test index 7fef128453a075..984d614fbf85f9 100644 --- a/bolt/test/X86/match-functions-with-calls-as-anchors.test +++ b/bolt/test/X86/match-functions-with-calls-as-anchors.test @@ -1,6 +1,6 @@ ## Tests blocks matching by called function names in inferStaleProfile. -# REQUIRES: system-linux +# REQUIRES: system-linux, asserts # RUN: split-file %s %t # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/main.s -o %t.o # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib diff --git a/clang-tools-extra/clang-tidy/boost/UseRangesCheck.cpp b/clang-tools-extra/clang-tidy/boost/UseRangesCheck.cpp index 9351a1c90ae546..4022ea0cdaf5ee 100644 --- a/clang-tools-extra/clang-tidy/boost/UseRangesCheck.cpp +++ b/clang-tools-extra/clang-tidy/boost/UseRangesCheck.cpp @@ -331,12 +331,15 @@ utils::UseRangesCheck::ReplacerMap UseRangesCheck::getReplacerMap() const { UseRangesCheck::UseRangesCheck(StringRef Name, ClangTidyContext *Context) : utils::UseRangesCheck(Name, Context), - IncludeBoostSystem(Options.get("IncludeBoostSystem", true)) {} + IncludeBoostSystem(Options.get("IncludeBoostSystem", true)), + UseReversePipe(Options.get("UseReversePipe", false)) {} void UseRangesCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { utils::UseRangesCheck::storeOptions(Opts); Options.store(Opts, "IncludeBoostSystem", IncludeBoostSystem); + Options.store(Opts, "UseReversePipe", UseReversePipe); } + DiagnosticBuilder UseRangesCheck::createDiag(const CallExpr &Call) { DiagnosticBuilder D = diag(Call.getBeginLoc(), "use a %0 version of this algorithm"); @@ -362,10 +365,10 @@ UseRangesCheck::getReverseDescriptor() const { {"::boost::rbegin", "::boost::rend"}, {"::boost::const_rbegin", "::boost::const_rend"}, }; - return ReverseIteratorDescriptor{"boost::adaptors::reverse", - IncludeBoostSystem - ? "" - : "boost/range/adaptor/reversed.hpp", - Refs}; + return ReverseIteratorDescriptor{ + UseReversePipe ? "boost::adaptors::reversed" : "boost::adaptors::reverse", + IncludeBoostSystem ? "" + : "boost/range/adaptor/reversed.hpp", + Refs, UseReversePipe}; } } // namespace clang::tidy::boost diff --git a/clang-tools-extra/clang-tidy/boost/UseRangesCheck.h b/clang-tools-extra/clang-tidy/boost/UseRangesCheck.h index a59ced12a6c438..b081c4c479b92a 100644 --- a/clang-tools-extra/clang-tidy/boost/UseRangesCheck.h +++ b/clang-tools-extra/clang-tidy/boost/UseRangesCheck.h @@ -36,6 +36,7 @@ class UseRangesCheck : public utils::UseRangesCheck { private: bool IncludeBoostSystem; + bool UseReversePipe; }; } // namespace clang::tidy::boost diff --git a/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp index f578f7ea71c08d..0b38b182081947 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp @@ -95,7 +95,7 @@ static bool haveSameNamespaceOrTranslationUnit(const CXXRecordDecl *Decl1, "ParentDecl2 declaration must be a namespace"); auto *Ns1 = NamespaceDecl::castFromDeclContext(ParentDecl1); auto *Ns2 = NamespaceDecl::castFromDeclContext(ParentDecl2); - return Ns1->getOriginalNamespace() == Ns2->getOriginalNamespace(); + return Ns1->getFirstDecl() == Ns2->getFirstDecl(); } static std::string getNameOfNamespace(const CXXRecordDecl *Decl) { diff --git a/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.cpp index f99beac668ce72..46bf20e34ce041 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.cpp @@ -42,6 +42,7 @@ ImplicitWideningOfMultiplicationResultCheck:: UseCXXStaticCastsInCppSources( Options.get("UseCXXStaticCastsInCppSources", true)), UseCXXHeadersInCppSources(Options.get("UseCXXHeadersInCppSources", true)), + IgnoreConstantIntExpr(Options.get("IgnoreConstantIntExpr", false)), IncludeInserter(Options.getLocalOrGlobal("IncludeStyle", utils::IncludeSorter::IS_LLVM), areDiagsSelfContained()) {} @@ -56,6 +57,7 @@ void ImplicitWideningOfMultiplicationResultCheck::storeOptions( Options.store(Opts, "UseCXXStaticCastsInCppSources", UseCXXStaticCastsInCppSources); Options.store(Opts, "UseCXXHeadersInCppSources", UseCXXHeadersInCppSources); + Options.store(Opts, "IgnoreConstantIntExpr", IgnoreConstantIntExpr); Options.store(Opts, "IncludeStyle", IncludeInserter.getStyle()); } @@ -84,6 +86,19 @@ void ImplicitWideningOfMultiplicationResultCheck::handleImplicitCastExpr( if (TgtWidth <= SrcWidth) return; + // Is the expression a compile-time constexpr that we know can fit in the + // source type? + if (IgnoreConstantIntExpr && ETy->isIntegerType() && + !ETy->isUnsignedIntegerType()) { + if (const auto ConstExprResult = E->getIntegerConstantExpr(*Context)) { + const auto TypeSize = Context->getTypeSize(ETy); + llvm::APSInt WidenedResult = ConstExprResult->extOrTrunc(TypeSize); + if (WidenedResult <= llvm::APSInt::getMaxValue(TypeSize, false) && + WidenedResult >= llvm::APSInt::getMinValue(TypeSize, false)) + return; + } + } + // Does the index expression look like it might be unintentionally computed // in a narrower-than-wanted type? const Expr *LHS = getLHSOfMulBinOp(E); diff --git a/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.h b/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.h index 8b99930ae7a899..077a4b847cd9c5 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.h @@ -41,6 +41,7 @@ class ImplicitWideningOfMultiplicationResultCheck : public ClangTidyCheck { private: const bool UseCXXStaticCastsInCppSources; const bool UseCXXHeadersInCppSources; + const bool IgnoreConstantIntExpr; utils::IncludeInserter IncludeInserter; }; diff --git a/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.cpp index 5c7b315f43173b..b0a31ad53be3f7 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.cpp @@ -166,6 +166,15 @@ utils::UseRangesCheck::ReplacerMap UseRangesCheck::getReplacerMap() const { return Result; } +UseRangesCheck::UseRangesCheck(StringRef Name, ClangTidyContext *Context) + : utils::UseRangesCheck(Name, Context), + UseReversePipe(Options.get("UseReversePipe", false)) {} + +void UseRangesCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { + utils::UseRangesCheck::storeOptions(Opts); + Options.store(Opts, "UseReversePipe", UseReversePipe); +} + bool UseRangesCheck::isLanguageVersionSupported( const LangOptions &LangOpts) const { return LangOpts.CPlusPlus20; @@ -180,6 +189,8 @@ std::optional UseRangesCheck::getReverseDescriptor() const { static const std::pair Refs[] = { {"::std::rbegin", "::std::rend"}, {"::std::crbegin", "::std::crend"}}; - return ReverseIteratorDescriptor{"std::views::reverse", "", Refs}; + return ReverseIteratorDescriptor{UseReversePipe ? "std::views::reverse" + : "std::ranges::reverse_view", + "", Refs, UseReversePipe}; } } // namespace clang::tidy::modernize diff --git a/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.h b/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.h index 2f7613dd1cd246..2f4cace653cf19 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.h @@ -20,7 +20,9 @@ namespace clang::tidy::modernize { /// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-ranges.html class UseRangesCheck : public utils::UseRangesCheck { public: - using utils::UseRangesCheck::UseRangesCheck; + UseRangesCheck(StringRef CheckName, ClangTidyContext *Context); + + void storeOptions(ClangTidyOptions::OptionMap &Options) override; ReplacerMap getReplacerMap() const override; @@ -31,6 +33,9 @@ class UseRangesCheck : public utils::UseRangesCheck { getReverseDescriptor() const override; bool isLanguageVersionSupported(const LangOptions &LangOpts) const override; + +private: + bool UseReversePipe; }; } // namespace clang::tidy::modernize diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp index c507043c367a86..5a4c2363bd8af0 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp @@ -75,7 +75,7 @@ void UnnecessaryValueParamCheck::registerMatchers(MatchFinder *Finder) { functionDecl(hasBody(stmt()), isDefinition(), unless(isImplicit()), unless(cxxMethodDecl(anyOf(isOverride(), isFinal()))), has(typeLoc(forEach(ExpensiveValueParamDecl))), - unless(isInstantiated()), decl().bind("functionDecl"))), + decl().bind("functionDecl"))), this); } @@ -133,12 +133,11 @@ void UnnecessaryValueParamCheck::check(const MatchFinder::MatchResult &Result) { // 2. the function is virtual as it might break overrides // 3. the function is referenced outside of a call expression within the // compilation unit as the signature change could introduce build errors. - // 4. the function is a primary template or an explicit template - // specialization. + // 4. the function is an explicit template/ specialization. const auto *Method = llvm::dyn_cast(Function); if (Param->getBeginLoc().isMacroID() || (Method && Method->isVirtual()) || isReferencedOutsideOfCallExpr(*Function, *Result.Context) || - (Function->getTemplatedKind() != FunctionDecl::TK_NonTemplate)) + Function->getTemplateSpecializationKind() == TSK_ExplicitSpecialization) return; for (const auto *FunctionDecl = Function; FunctionDecl != nullptr; FunctionDecl = FunctionDecl->getPreviousDecl()) { diff --git a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp index 9c59e4651953ac..e2daa5010e2aeb 100644 --- a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp @@ -242,16 +242,20 @@ void UseRangesCheck::check(const MatchFinder::MatchResult &Result) { Diag << Inserter.createIncludeInsertion( Result.SourceManager->getFileID(Call->getBeginLoc()), *ReverseDescriptor->ReverseHeader); + StringRef ArgText = Lexer::getSourceText( + CharSourceRange::getTokenRange(ArgExpr->getSourceRange()), + Result.Context->getSourceManager(), Result.Context->getLangOpts()); + SmallString<128> ReplaceText; + if (ReverseDescriptor->IsPipeSyntax) + ReplaceText.assign( + {ArgText, " | ", ReverseDescriptor->ReverseAdaptorName}); + else + ReplaceText.assign( + {ReverseDescriptor->ReverseAdaptorName, "(", ArgText, ")"}); Diag << FixItHint::CreateReplacement( Call->getArg(Replace == Indexes::Second ? Second : First) ->getSourceRange(), - SmallString<128>{ - ReverseDescriptor->ReverseAdaptorName, "(", - Lexer::getSourceText( - CharSourceRange::getTokenRange(ArgExpr->getSourceRange()), - Result.Context->getSourceManager(), - Result.Context->getLangOpts()), - ")"}); + ReplaceText); } ToRemove.push_back(Replace == Indexes::Second ? First : Second); } diff --git a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h index 8227d8f7bbbddf..927e9694b0ec7c 100644 --- a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h +++ b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h @@ -38,6 +38,7 @@ class UseRangesCheck : public ClangTidyCheck { StringRef ReverseAdaptorName; std::optional ReverseHeader; ArrayRef> FreeReverseNames; + bool IsPipeSyntax = false; }; class Replacer : public llvm::RefCountedBase { diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index c1fa502534ea52..004811d2eca4f4 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -245,6 +245,11 @@ Changes in existing checks ` check to ignore deleted constructors which won't hide other overloads. +- Improved :doc:`bugprone-implicit-widening-of-multiplication-result + ` check + by adding an option to ignore constant expressions of signed integer types + that fit in the source expression type. + - Improved :doc:`bugprone-inc-dec-in-conditions ` check to ignore code within unevaluated contexts, such as ``decltype``. @@ -437,6 +442,12 @@ Changes in existing checks Calls to mutable function where there exists a `const` overload are also handled. Fix crash in the case of a non-member operator call. +- Improved :doc:`performance-unnecessary-value-param + ` check + detecting more cases for template functions including lambdas with ``auto``. + E.g., ``std::sort(a.begin(), a.end(), [](auto x, auto y) { return a > b; });`` + will be detected for expensive to copy types. + - Improved :doc:`readability-avoid-return-with-void-value ` check by adding fix-its. diff --git a/clang-tools-extra/docs/clang-tidy/checks/boost/use-ranges.rst b/clang-tools-extra/docs/clang-tidy/checks/boost/use-ranges.rst index 39be52fdcf7b91..4c032ad32f4fd8 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/boost/use-ranges.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/boost/use-ranges.rst @@ -154,8 +154,8 @@ Transforms to: .. code-block:: c++ - auto AreSame = std::equal(boost::adaptors::reverse(Items1), - boost::adaptors::reverse(Items2)); + auto AreSame = boost::range::equal(boost::adaptors::reverse(Items1), + boost::adaptors::reverse(Items2)); Options ------- @@ -170,3 +170,18 @@ Options If `true` (default value) the boost headers are included as system headers with angle brackets (`#include `), otherwise quotes are used (`#include "boost.hpp"`). + +.. option:: UseReversePipe + + When `true` (default `false`), fixes which involve reverse ranges will use the + pipe adaptor syntax instead of the function syntax. + + .. code-block:: c++ + + std::find(Items.rbegin(), Items.rend(), 0); + + Transforms to: + + .. code-block:: c++ + + boost::range::find(Items | boost::adaptors::reversed, 0); diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/implicit-widening-of-multiplication-result.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/implicit-widening-of-multiplication-result.rst index c4ddd02602b73d..117310d404f6f4 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/implicit-widening-of-multiplication-result.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/implicit-widening-of-multiplication-result.rst @@ -45,6 +45,12 @@ Options should ```` header be suggested, or ````. Defaults to ``true``. +.. option:: IgnoreConstantIntExpr + + If the multiplication operands are compile-time constants (like literals or + are ``constexpr``) and fit within the source expression type, do not emit a + diagnostic or suggested fix. Only considers expressions where the source + expression is a signed integer type. Defaults to ``false``. Examples: diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-ranges.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-ranges.rst index 86af6b0eeb8e03..5c0b8058e4535f 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-ranges.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-ranges.rst @@ -116,8 +116,8 @@ Transforms to: .. code-block:: c++ - auto AreSame = std::equal(std::views::reverse(Items1), - std::views::reverse(Items2)); + auto AreSame = std::ranges::equal(std::ranges::reverse_view(Items1), + std::ranges::reverse_view(Items2)); Options ------- @@ -127,3 +127,17 @@ Options A string specifying which include-style is used, `llvm` or `google`. Default is `llvm`. +.. option:: UseReversePipe + + When `true` (default `false`), fixes which involve reverse ranges will use the + pipe adaptor syntax instead of the function syntax. + + .. code-block:: c++ + + std::find(Items.rbegin(), Items.rend(), 0); + + Transforms to: + + .. code-block:: c++ + + std::ranges::find(Items | std::views::reverse, 0); diff --git a/clang-tools-extra/test/clang-doc/basic-project.test b/clang-tools-extra/test/clang-doc/basic-project.test index bab5f8e1761bc6..51d3ac6ce6dcdb 100644 --- a/clang-tools-extra/test/clang-doc/basic-project.test +++ b/clang-tools-extra/test/clang-doc/basic-project.test @@ -54,306 +54,181 @@ // JSON-INDEX-NEXT: }; // JSON-INDEX-NEXT: } -// HTML-SHAPE: -// HTML-SHAPE-NEXT: -// HTML-SHAPE-NEXT: class Shape -// HTML-SHAPE-NEXT: -// HTML-SHAPE-NEXT: -// HTML-SHAPE-NEXT: -// HTML-SHAPE-NEXT:
-// HTML-SHAPE-NEXT:
-// HTML-SHAPE-NEXT: -// HTML-SHAPE-NEXT:
-// HTML-SHAPE-NEXT:

class Shape

-// HTML-SHAPE-NEXT:

Defined at line 8 of file {{.*}}Shape.h

-// HTML-SHAPE-NEXT:
-// HTML-SHAPE-NEXT:
-// HTML-SHAPE-NEXT:

Provides a common interface for different types of shapes.

-// HTML-SHAPE-NEXT:
-// HTML-SHAPE-NEXT:
-// HTML-SHAPE-NEXT:

Functions

-// HTML-SHAPE-NEXT:
-// HTML-SHAPE-NEXT:

~Shape

-// HTML-SHAPE-NEXT:

public void ~Shape()

-// HTML-SHAPE-NEXT:

Defined at line 13 of file {{.*}}Shape.h

-// HTML-SHAPE-NEXT:
-// HTML-SHAPE-NEXT:
-// HTML-SHAPE-NEXT:
-// HTML-SHAPE-NEXT:

area

-// HTML-SHAPE-NEXT:

public double area()

-// HTML-SHAPE-NEXT:
-// HTML-SHAPE-NEXT:
-// HTML-SHAPE-NEXT:
-// HTML-SHAPE-NEXT:

perimeter

-// HTML-SHAPE-NEXT:

public double perimeter()

-// HTML-SHAPE-NEXT:
-// HTML-SHAPE-NEXT:
-// HTML-SHAPE-NEXT:
-// HTML-SHAPE-NEXT:
-// HTML-SHAPE-NEXT:
-// HTML-SHAPE-NEXT: -// HTML-SHAPE-NEXT:
+// HTML-SHAPE:

class Shape

+// HTML-SHAPE:

Defined at line 8 of file {{.*}}Shape.h

+// HTML-SHAPE:

Provides a common interface for different types of shapes.

+// HTML-SHAPE:

Functions

+// HTML-SHAPE:

~Shape

+// HTML-SHAPE:

public void ~Shape()

+// HTML-SHAPE:

Defined at line 13 of file {{.*}}Shape.h

+// HTML-SHAPE:

area

+// HTML-SHAPE:

public double area()

+// HTML-SHAPE:

perimeter

+// HTML-SHAPE:

public double perimeter()

-// HTML-CALC: -// HTML-CALC-NEXT: -// HTML-CALC-NEXT: class Calculator -// HTML-CALC-NEXT: -// HTML-CALC-NEXT: -// HTML-CALC-NEXT: -// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT: -// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:

class Calculator

-// HTML-CALC-NEXT:

Defined at line 8 of file {{.*}}Calculator.h

-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:

Provides basic arithmetic operations.

-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:

Functions

-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:

add

-// HTML-CALC-NEXT:

public int add(int a, int b)

-// HTML-CALC-NEXT:

Defined at line 3 of file {{.*}}Calculator.cpp

-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:

subtract

-// HTML-CALC-NEXT:

public int subtract(int a, int b)

-// HTML-CALC-NEXT:

Defined at line 7 of file {{.*}}Calculator.cpp

-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:

multiply

-// HTML-CALC-NEXT:

public int multiply(int a, int b)

-// HTML-CALC-NEXT:

Defined at line 11 of file {{.*}}Calculator.cpp

-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:

divide

-// HTML-CALC-NEXT:

public double divide(int a, int b)

-// HTML-CALC-NEXT:

Defined at line 15 of file {{.*}}Calculator.cpp

-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT:
-// HTML-CALC-NEXT: -// HTML-CALC-NEXT:
+// HTML-CALC:

class Calculator

+// HTML-CALC:

Defined at line 8 of file {{.*}}Calculator.h

+// HTML-CALC:

Provides basic arithmetic operations.

+// HTML-CALC:

Functions

+// HTML-CALC:

add

+// HTML-CALC:

public int add(int a, int b)

+// HTML-CALC:

Defined at line 3 of file {{.*}}Calculator.cpp

+// HTML-CALC:

subtract

+// HTML-CALC:

public int subtract(int a, int b)

+// HTML-CALC:

Defined at line 7 of file {{.*}}Calculator.cpp

+// HTML-CALC:

multiply

+// HTML-CALC:

public int multiply(int a, int b)

+// HTML-CALC:

Defined at line 11 of file {{.*}}Calculator.cpp

+// HTML-CALC:

divide

+// HTML-CALC:

public double divide(int a, int b)

+// HTML-CALC:

Defined at line 15 of file {{.*}}Calculator.cpp

-// HTML-RECTANGLE: -// HTML-RECTANGLE-NEXT: -// HTML-RECTANGLE-NEXT: class Rectangle -// HTML-RECTANGLE-NEXT: -// HTML-RECTANGLE-NEXT: -// HTML-RECTANGLE-NEXT: -// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT: -// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT:

class Rectangle

-// HTML-RECTANGLE-NEXT:

Defined at line 10 of file {{.*}}Rectangle.h

-// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT:

Represents a rectangle with a given width and height.

-// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT:

-// HTML-RECTANGLE-NEXT: Inherits from -// HTML-RECTANGLE-NEXT: Shape -// HTML-RECTANGLE-NEXT:

-// HTML-RECTANGLE-NEXT:

Members

-// HTML-RECTANGLE-NEXT:
    -// HTML-RECTANGLE-NEXT:
  • private double width_
  • -// HTML-RECTANGLE-NEXT:
  • private double height_
  • -// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT:

Functions

-// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT:

Rectangle

-// HTML-RECTANGLE-NEXT:

public void Rectangle(double width, double height)

-// HTML-RECTANGLE-NEXT:

Defined at line 3 of file {{.*}}Rectangle.cpp

-// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT:

area

-// HTML-RECTANGLE-NEXT:

public double area()

-// HTML-RECTANGLE-NEXT:

Defined at line 6 of file {{.*}}Rectangle.cpp

-// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT:

perimeter

-// HTML-RECTANGLE-NEXT:

public double perimeter()

-// HTML-RECTANGLE-NEXT:

Defined at line 10 of file {{.*}}Rectangle.cpp

-// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT:
-// HTML-RECTANGLE-NEXT: -// HTML-RECTANGLE-NEXT:
+// HTML-RECTANGLE:

class Rectangle

+// HTML-RECTANGLE:

Defined at line 10 of file {{.*}}Rectangle.h

+// HTML-RECTANGLE:

Represents a rectangle with a given width and height.

+// HTML-RECTANGLE: Inherits from +// HTML-RECTANGLE: Shape +// HTML-RECTANGLE:

+// HTML-RECTANGLE:

Members

+// HTML-RECTANGLE:
  • private double width_
  • +// HTML-RECTANGLE:
  • private double height_
  • +// HTML-RECTANGLE:

    Functions

    +// HTML-RECTANGLE:

    Rectangle

    +// HTML-RECTANGLE:

    public void Rectangle(double width, double height)

    +// HTML-RECTANGLE:

    Defined at line 3 of file {{.*}}Rectangle.cpp

    +// HTML-RECTANGLE:

    area

    +// HTML-RECTANGLE:

    public double area()

    +// HTML-RECTANGLE:

    Defined at line 6 of file {{.*}}Rectangle.cpp

    +// HTML-RECTANGLE:

    perimeter

    +// HTML-RECTANGLE:

    public double perimeter()

    +// HTML-RECTANGLE:

    Defined at line 10 of file {{.*}}Rectangle.cpp

    -// HTML-CIRCLE: -// HTML-CIRCLE-NEXT: -// HTML-CIRCLE-NEXT: class Circle -// HTML-CIRCLE-NEXT: -// HTML-CIRCLE-NEXT: -// HTML-CIRCLE-NEXT: -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT: -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT:

    class Circle

    -// HTML-CIRCLE-NEXT:

    Defined at line 10 of file {{.*}}Circle.h

    -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT:

    Represents a circle with a given radius.

    -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT:

    -// HTML-CIRCLE-NEXT: Inherits from -// HTML-CIRCLE-NEXT: Shape -// HTML-CIRCLE-NEXT:

    -// HTML-CIRCLE-NEXT:

    Members

    -// HTML-CIRCLE-NEXT:
      -// HTML-CIRCLE-NEXT:
    • private double radius_
    • -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT:

    Functions

    -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT:

    Circle

    -// HTML-CIRCLE-NEXT:

    public void Circle(double radius)

    -// HTML-CIRCLE-NEXT:

    Defined at line 3 of file {{.*}}Circle.cpp

    -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT:

    area

    -// HTML-CIRCLE-NEXT:

    public double area()

    -// HTML-CIRCLE-NEXT:

    Defined at line 5 of file {{.*}}Circle.cpp

    -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT:

    perimeter

    -// HTML-CIRCLE-NEXT:

    public double perimeter()

    -// HTML-CIRCLE-NEXT:

    Defined at line 9 of file {{.*}}Circle.cpp

    -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT:
    -// HTML-CIRCLE-NEXT: -// HTML-CIRCLE-NEXT:
    \ No newline at end of file +// HTML-CIRCLE:

    class Circle

    +// HTML-CIRCLE:

    Defined at line 10 of file {{.*}}Circle.h

    +// HTML-CIRCLE:

    Represents a circle with a given radius.

    +// HTML-CIRCLE:

    +// HTML-CIRCLE: Inherits from +// HTML-CIRCLE: Shape +// HTML-CIRCLE:

    +// HTML-CIRCLE:

    Members

    +// HTML-CIRCLE:
  • private double radius_
  • +// HTML-CIRCLE:

    Functions

    +// HTML-CIRCLE:

    Circle

    +// HTML-CIRCLE:

    public void Circle(double radius)

    +// HTML-CIRCLE:

    Defined at line 3 of file {{.*}}Circle.cpp

    +// HTML-CIRCLE:

    area

    +// HTML-CIRCLE:

    public double area()

    +// HTML-CIRCLE:

    Defined at line 5 of file {{.*}}Circle.cpp

    +// HTML-CIRCLE:

    perimeter

    +// HTML-CIRCLE:

    public double perimeter()

    +// HTML-CIRCLE:

    Defined at line 9 of file {{.*}}Circle.cpp

    + +// MD-CALC: # class Calculator +// MD-CALC: *Defined at .{{[\/]}}include{{[\/]}}Calculator.h#8* +// MD-CALC: **brief** A simple calculator class. +// MD-CALC: Provides basic arithmetic operations. +// MD-CALC: ## Functions +// MD-CALC: ### add +// MD-CALC: *public int add(int a, int b)* +// MD-CALC: *Defined at .{{[\/]}}src{{[\/]}}Calculator.cpp#3* +// MD-CALC: **brief** Adds two integers. +// MD-CALC: **a** First integer. +// MD-CALC: **b** Second integer. +// MD-CALC: **return** int The sum of a and b. +// MD-CALC: ### subtract +// MD-CALC: *public int subtract(int a, int b)* +// MD-CALC: *Defined at .{{[\/]}}src{{[\/]}}Calculator.cpp#7* +// MD-CALC: **brief** Subtracts the second integer from the first. +// MD-CALC: **a** First integer. +// MD-CALC: **b** Second integer. +// MD-CALC: **return** int The result of a - b. +// MD-CALC: ### multiply +// MD-CALC: *public int multiply(int a, int b)* +// MD-CALC: *Defined at .{{[\/]}}src{{[\/]}}Calculator.cpp#11* +// MD-CALC: **brief** Multiplies two integers. +// MD-CALC: **a** First integer. +// MD-CALC: **b** Second integer. +// MD-CALC: **return** int The product of a and b. +// MD-CALC: ### divide +// MD-CALC: *public double divide(int a, int b)* +// MD-CALC: *Defined at .{{[\/]}}src{{[\/]}}Calculator.cpp#15* +// MD-CALC: **brief** Divides the first integer by the second. +// MD-CALC: **a** First integer. +// MD-CALC: **b** Second integer. +// MD-CALC: **return** double The result of a / b. +// MD-CALC: **throw**if b is zero. + +// MD-CIRCLE: # class Circle +// MD-CIRCLE: *Defined at .{{[\/]}}include{{[\/]}}Circle.h#10* +// MD-CIRCLE: **brief** Circle class derived from Shape. +// MD-CIRCLE: Represents a circle with a given radius. +// MD-CIRCLE: Inherits from Shape +// MD-CIRCLE: ## Members +// MD-CIRCLE: private double radius_ +// MD-CIRCLE: ## Functions +// MD-CIRCLE: ### Circle +// MD-CIRCLE: *public void Circle(double radius)* +// MD-CIRCLE: *Defined at .{{[\/]}}src{{[\/]}}Circle.cpp#3* +// MD-CIRCLE: **brief** Constructs a new Circle object. +// MD-CIRCLE: **radius** Radius of the circle. +// MD-CIRCLE: ### area +// MD-CIRCLE: *public double area()* +// MD-CIRCLE: *Defined at .{{[\/]}}src{{[\/]}}Circle.cpp#5* +// MD-CIRCLE: **brief** Calculates the area of the circle. +// MD-CIRCLE: **return** double The area of the circle. +// MD-CIRCLE: ### perimeter +// MD-CIRCLE: *public double perimeter()* +// MD-CIRCLE: *Defined at .{{[\/]}}src{{[\/]}}Circle.cpp#9* +// MD-CIRCLE: **brief** Calculates the perimeter of the circle. +// MD-CIRCLE: **return** double The perimeter of the circle. + +// MD-RECTANGLE: # class Rectangle +// MD-RECTANGLE: *Defined at .{{[\/]}}include{{[\/]}}Rectangle.h#10* +// MD-RECTANGLE: **brief** Rectangle class derived from Shape. +// MD-RECTANGLE: Represents a rectangle with a given width and height. +// MD-RECTANGLE: Inherits from Shape +// MD-RECTANGLE: ## Members +// MD-RECTANGLE: private double width_ +// MD-RECTANGLE: private double height_ +// MD-RECTANGLE: ## Functions +// MD-RECTANGLE: ### Rectangle +// MD-RECTANGLE: *public void Rectangle(double width, double height)* +// MD-RECTANGLE: *Defined at .{{[\/]}}src{{[\/]}}Rectangle.cpp#3* +// MD-RECTANGLE: **brief** Constructs a new Rectangle object. +// MD-RECTANGLE: **width** Width of the rectangle. +// MD-RECTANGLE: **height** Height of the rectangle. +// MD-RECTANGLE: ### area +// MD-RECTANGLE: *public double area()* +// MD-RECTANGLE: *Defined at .{{[\/]}}src{{[\/]}}Rectangle.cpp#6* +// MD-RECTANGLE: **brief** Calculates the area of the rectangle. +// MD-RECTANGLE: **return** double The area of the rectangle. +// MD-RECTANGLE: ### perimeter +// MD-RECTANGLE: *public double perimeter()* +// MD-RECTANGLE: *Defined at .{{[\/]}}src{{[\/]}}Rectangle.cpp#10* +// MD-RECTANGLE: **brief** Calculates the perimeter of the rectangle. +// MD-RECTANGLE: **return** double The perimeter of the rectangle. + +// MD-SHAPE: # class Shape +// MD-SHAPE: *Defined at .{{[\/]}}include{{[\/]}}Shape.h#8* +// MD-SHAPE: **brief** Abstract base class for shapes. +// MD-SHAPE: Provides a common interface for different types of shapes. +// MD-SHAPE: ## Functions +// MD-SHAPE: ### ~Shape +// MD-SHAPE: *public void ~Shape()* +// MD-SHAPE: *Defined at .{{[\/]}}include{{[\/]}}Shape.h#13* +// MD-SHAPE: **brief** Virtual destructor. +// MD-SHAPE: ### area +// MD-SHAPE: *public double area()* +// MD-SHAPE: **brief** Calculates the area of the shape. +// MD-SHAPE: **return** double The area of the shape. +// MD-SHAPE: ### perimeter +// MD-SHAPE: *public double perimeter()* +// MD-SHAPE: **brief** Calculates the perimeter of the shape. +// MD-SHAPE: **return** double The perimeter of the shape. + +// MD-ALL-FILES: # All Files +// MD-ALL-FILES: ## [GlobalNamespace](GlobalNamespace{{[\/]}}index.md) + +// MD-INDEX: # C/C++ Reference +// MD-INDEX: * Namespace: [GlobalNamespace](GlobalNamespace) \ No newline at end of file diff --git a/clang-tools-extra/test/clang-tidy/checkers/boost/Inputs/use-ranges/fake_boost.h b/clang-tools-extra/test/clang-tidy/checkers/boost/Inputs/use-ranges/fake_boost.h new file mode 100644 index 00000000000000..3664367a601109 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/boost/Inputs/use-ranges/fake_boost.h @@ -0,0 +1,29 @@ +#ifndef USE_RANGES_FAKE_BOOST_H +#define USE_RANGES_FAKE_BOOST_H + +namespace boost { +namespace range_adl_barrier { + +template void *begin(T &); +template void *end(T &); +template void *const_begin(const T &); +template void *const_end(const T &); +} // namespace range_adl_barrier + +using namespace range_adl_barrier; + +template void *rbegin(T &); +template void *rend(T &); + +template void *const_rbegin(T &); +template void *const_rend(T &); +namespace algorithm { + +template +T reduce(InputIterator first, InputIterator last, T init, BinaryOperation bOp) { + return init; +} +} // namespace algorithm +} // namespace boost + +#endif // USE_RANGES_FAKE_BOOST_H diff --git a/clang-tools-extra/test/clang-tidy/checkers/boost/Inputs/use-ranges/fake_std.h b/clang-tools-extra/test/clang-tidy/checkers/boost/Inputs/use-ranges/fake_std.h new file mode 100644 index 00000000000000..7c3e39d6000d20 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/boost/Inputs/use-ranges/fake_std.h @@ -0,0 +1,99 @@ +#ifndef USE_RANGES_FAKE_STD_H +#define USE_RANGES_FAKE_STD_H +namespace std { + +template class vector { +public: + using iterator = T *; + using const_iterator = const T *; + using reverse_iterator = T*; + using reverse_const_iterator = const T*; + + constexpr const_iterator begin() const; + constexpr const_iterator end() const; + constexpr const_iterator cbegin() const; + constexpr const_iterator cend() const; + constexpr iterator begin(); + constexpr iterator end(); + constexpr reverse_const_iterator rbegin() const; + constexpr reverse_const_iterator rend() const; + constexpr reverse_const_iterator crbegin() const; + constexpr reverse_const_iterator crend() const; + constexpr reverse_iterator rbegin(); + constexpr reverse_iterator rend(); +}; + +template constexpr auto begin(const Container &Cont) { + return Cont.begin(); +} + +template constexpr auto begin(Container &Cont) { + return Cont.begin(); +} + +template constexpr auto end(const Container &Cont) { + return Cont.end(); +} + +template constexpr auto end(Container &Cont) { + return Cont.end(); +} + +template constexpr auto cbegin(const Container &Cont) { + return Cont.cbegin(); +} + +template constexpr auto cend(const Container &Cont) { + return Cont.cend(); +} +// Find +template< class InputIt, class T > +InputIt find(InputIt first, InputIt last, const T& value); + +template void reverse(Iter begin, Iter end); + +template +bool includes(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2); + +template +bool is_permutation(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, + ForwardIt2 last2); + +template +bool next_permutation(BidirIt first, BidirIt last); + +inline namespace inline_test{ + +template +bool equal(ForwardIt1 first1, ForwardIt1 last1, + ForwardIt2 first2, ForwardIt2 last2); + +template +void push_heap(RandomIt first, RandomIt last); + +template +OutputIt copy_if(InputIt first, InputIt last, OutputIt d_first, UnaryPred pred); + +template +ForwardIt is_sorted_until(ForwardIt first, ForwardIt last); + +template +void reduce(InputIt first, InputIt last); + +template +T reduce(InputIt first, InputIt last, T init); + +template +T reduce(InputIt first, InputIt last, T init, BinaryOp op) { + // Need a definition to suppress undefined_internal_type when invoked with lambda + return init; +} + +template +T accumulate(InputIt first, InputIt last, T init); + +} // namespace inline_test + +} // namespace std + +#endif // USE_RANGES_FAKE_STD_H diff --git a/clang-tools-extra/test/clang-tidy/checkers/boost/use-ranges-pipe.cpp b/clang-tools-extra/test/clang-tidy/checkers/boost/use-ranges-pipe.cpp new file mode 100644 index 00000000000000..c0ce3748400981 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/boost/use-ranges-pipe.cpp @@ -0,0 +1,18 @@ +// RUN: %check_clang_tidy -std=c++14 %s boost-use-ranges %t -check-suffixes=,PIPE \ +// RUN: -config="{CheckOptions: { \ +// RUN: boost-use-ranges.UseReversePipe: true }}" -- -I %S/Inputs/use-ranges/ +// RUN: %check_clang_tidy -std=c++14 %s boost-use-ranges %t -check-suffixes=,NOPIPE -- -I %S/Inputs/use-ranges/ + +// CHECK-FIXES: #include +// CHECK-FIXES: #include + +#include "fake_std.h" + +void stdLib() { + std::vector I; + std::is_sorted_until(I.rbegin(), I.rend()); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use a boost version of this algorithm + // CHECK-FIXES-NOPIPE: boost::algorithm::is_sorted_until(boost::adaptors::reverse(I)); + // CHECK-FIXES-PIPE: boost::algorithm::is_sorted_until(I | boost::adaptors::reversed); + +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/boost/use-ranges.cpp b/clang-tools-extra/test/clang-tidy/checkers/boost/use-ranges.cpp index 3f3d6f1abec9f4..06e70267da83a7 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/boost/use-ranges.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/boost/use-ranges.cpp @@ -1,5 +1,5 @@ -// RUN: %check_clang_tidy -std=c++14 %s boost-use-ranges %t -// RUN: %check_clang_tidy -std=c++17 %s boost-use-ranges %t -check-suffixes=,CPP17 +// RUN: %check_clang_tidy -std=c++14 %s boost-use-ranges %t -- -- -I %S/Inputs/use-ranges/ +// RUN: %check_clang_tidy -std=c++17 %s boost-use-ranges %t -check-suffixes=,CPP17 -- -I %S/Inputs/use-ranges/ // CHECK-FIXES: #include // CHECK-FIXES: #include @@ -13,111 +13,8 @@ // CHECK-FIXES: #include // CHECK-FIXES: #include -namespace std { - -template class vector { -public: - using iterator = T *; - using const_iterator = const T *; - constexpr const_iterator begin() const; - constexpr const_iterator end() const; - constexpr const_iterator cbegin() const; - constexpr const_iterator cend() const; - constexpr iterator begin(); - constexpr iterator end(); -}; - -template constexpr auto begin(const Container &Cont) { - return Cont.begin(); -} - -template constexpr auto begin(Container &Cont) { - return Cont.begin(); -} - -template constexpr auto end(const Container &Cont) { - return Cont.end(); -} - -template constexpr auto end(Container &Cont) { - return Cont.end(); -} - -template constexpr auto cbegin(const Container &Cont) { - return Cont.cbegin(); -} - -template constexpr auto cend(const Container &Cont) { - return Cont.cend(); -} -// Find -template< class InputIt, class T > -InputIt find(InputIt first, InputIt last, const T& value); - -template void reverse(Iter begin, Iter end); - -template -bool includes(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2); - -template -bool is_permutation(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, - ForwardIt2 last2); - -template -bool next_permutation(BidirIt first, BidirIt last); - -template -bool equal(ForwardIt1 first1, ForwardIt1 last1, - ForwardIt2 first2, ForwardIt2 last2); - -template -void push_heap(RandomIt first, RandomIt last); - -template -OutputIt copy_if(InputIt first, InputIt last, OutputIt d_first, UnaryPred pred); - -template -ForwardIt is_sorted_until(ForwardIt first, ForwardIt last); - -template -void reduce(InputIt first, InputIt last); - -template -T reduce(InputIt first, InputIt last, T init); - -template -T reduce(InputIt first, InputIt last, T init, BinaryOp op) { - // Need a definition to suppress undefined_internal_type when invoked with lambda - return init; -} - -template -T accumulate(InputIt first, InputIt last, T init); - -} // namespace std - -namespace boost { -namespace range_adl_barrier { -template void *begin(T &); -template void *end(T &); -template void *const_begin(const T &); -template void *const_end(const T &); -} // namespace range_adl_barrier -using namespace range_adl_barrier; - -template void *rbegin(T &); -template void *rend(T &); - -template void *const_rbegin(T &); -template void *const_rend(T &); -namespace algorithm { - -template -T reduce(InputIterator first, InputIterator last, T init, BinaryOperation bOp) { - return init; -} -} // namespace algorithm -} // namespace boost +#include "fake_boost.h" +#include "fake_std.h" bool returnTrue(int val) { return true; diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/implicit-widening-of-multiplication-result-constants.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/implicit-widening-of-multiplication-result-constants.cpp new file mode 100644 index 00000000000000..d7ab8a7a44fe68 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/implicit-widening-of-multiplication-result-constants.cpp @@ -0,0 +1,56 @@ +// RUN: %check_clang_tidy %s bugprone-implicit-widening-of-multiplication-result %t -- \ +// RUN: -config='{CheckOptions: { \ +// RUN: bugprone-implicit-widening-of-multiplication-result.IgnoreConstantIntExpr: true \ +// RUN: }}' -- -target x86_64-unknown-unknown -x c++ + +long t0() { + return 1 * 4; +} + +unsigned long t1() { + const int a = 2; + const int b = 3; + return a * b; +} + +long t2() { + constexpr int a = 16383; // ~1/2 of int16_t max + constexpr int b = 2; + return a * b; +} + +constexpr int global_value() { + return 16; +} + +unsigned long t3() { + constexpr int a = 3; + return a * global_value(); +} + +long t4() { + const char a = 3; + const short b = 2; + const int c = 5; + return c * b * a; +} + +long t5() { + constexpr int min_int = (-2147483647 - 1); // A literal of -2147483648 evaluates to long + return 1 * min_int; +} + +unsigned long n0() { + const int a = 1073741824; // 1/2 of int32_t max + const int b = 3; + return a * b; + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: performing an implicit widening conversion to type 'unsigned long' of a multiplication performed in type 'int' + // CHECK-MESSAGES: :[[@LINE-2]]:10: note: make conversion explicit to silence this warning + // CHECK-MESSAGES: static_cast( ) + // CHECK-MESSAGES: :[[@LINE-4]]:10: note: perform multiplication in a wider type +} + +double n1() { + const long a = 100000000; + return a * 400; +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/use-ranges/fake_std.h b/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/use-ranges/fake_std.h new file mode 100644 index 00000000000000..987ee4e35b3bcd --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/use-ranges/fake_std.h @@ -0,0 +1,111 @@ +#ifndef USE_RANGES_FAKE_STD_H +#define USE_RANGES_FAKE_STD_H + +namespace std { + +template class vector { +public: + using iterator = T *; + using const_iterator = const T *; + using reverse_iterator = T*; + using reverse_const_iterator = const T*; + + constexpr const_iterator begin() const; + constexpr const_iterator end() const; + constexpr const_iterator cbegin() const; + constexpr const_iterator cend() const; + constexpr iterator begin(); + constexpr iterator end(); + constexpr reverse_const_iterator rbegin() const; + constexpr reverse_const_iterator rend() const; + constexpr reverse_const_iterator crbegin() const; + constexpr reverse_const_iterator crend() const; + constexpr reverse_iterator rbegin(); + constexpr reverse_iterator rend(); +}; + +template constexpr auto begin(const Container &Cont) { + return Cont.begin(); +} + +template constexpr auto begin(Container &Cont) { + return Cont.begin(); +} + +template constexpr auto end(const Container &Cont) { + return Cont.end(); +} + +template constexpr auto end(Container &Cont) { + return Cont.end(); +} + +template constexpr auto cbegin(const Container &Cont) { + return Cont.cbegin(); +} + +template constexpr auto cend(const Container &Cont) { + return Cont.cend(); +} + +template constexpr auto rbegin(const Container &Cont) { + return Cont.rbegin(); +} + +template constexpr auto rbegin(Container &Cont) { + return Cont.rbegin(); +} + +template constexpr auto rend(const Container &Cont) { + return Cont.rend(); +} + +template constexpr auto rend(Container &Cont) { + return Cont.rend(); +} + +template constexpr auto crbegin(const Container &Cont) { + return Cont.crbegin(); +} + +template constexpr auto crend(const Container &Cont) { + return Cont.crend(); +} +// Find +template< class InputIt, class T > +InputIt find( InputIt first, InputIt last, const T& value ); + +// Reverse +template void reverse(Iter begin, Iter end); + +// Includes +template +bool includes(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2); + +// IsPermutation +template +bool is_permutation(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2); +template +bool is_permutation(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, + ForwardIt2 last2); + +// Equal +template +bool equal(InputIt1 first1, InputIt1 last1, InputIt2 first2); + +template +bool equal(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2); + +template +bool equal(InputIt1 first1, InputIt1 last1, + InputIt2 first2, InputIt2 last2, BinaryPred p) { + // Need a definition to suppress undefined_internal_type when invoked with lambda + return true; +} + +template +void iota(ForwardIt first, ForwardIt last, T value); + +} // namespace std + +#endif // USE_RANGES_FAKE_STD_H diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-ranges-pipe.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-ranges-pipe.cpp new file mode 100644 index 00000000000000..f53fb70427e2d6 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-ranges-pipe.cpp @@ -0,0 +1,18 @@ +// RUN: %check_clang_tidy -std=c++20 %s modernize-use-ranges %t -check-suffixes=,PIPE \ +// RUN: -config="{CheckOptions: { \ +// RUN: modernize-use-ranges.UseReversePipe: true }}" -- -I %S/Inputs/use-ranges/ +// RUN: %check_clang_tidy -std=c++20 %s modernize-use-ranges %t -check-suffixes=,NOPIPE -- -I %S/Inputs/use-ranges/ + +// CHECK-FIXES: #include +// CHECK-FIXES: #include + +#include "fake_std.h" + +void stdLib() { + std::vector I; + std::find(I.rbegin(), I.rend(), 0); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use a ranges version of this algorithm + // CHECK-FIXES-NOPIPE: std::ranges::find(std::ranges::reverse_view(I), 0); + // CHECK-FIXES-PIPE: std::ranges::find(I | std::views::reverse, 0); + +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-ranges.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-ranges.cpp index 623af26e3cdc73..e937e1e4e7d3b5 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-ranges.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-ranges.cpp @@ -1,116 +1,11 @@ -// RUN: %check_clang_tidy -std=c++20 %s modernize-use-ranges %t -// RUN: %check_clang_tidy -std=c++23 %s modernize-use-ranges %t -check-suffixes=,CPP23 +// RUN: %check_clang_tidy -std=c++20 %s modernize-use-ranges %t -- -- -I %S/Inputs/use-ranges/ +// RUN: %check_clang_tidy -std=c++23 %s modernize-use-ranges %t -check-suffixes=,CPP23 -- -I %S/Inputs/use-ranges/ // CHECK-FIXES: #include // CHECK-FIXES-CPP23: #include // CHECK-FIXES: #include -namespace std { - -template class vector { -public: - using iterator = T *; - using const_iterator = const T *; - using reverse_iterator = T*; - using reverse_const_iterator = const T*; - - constexpr const_iterator begin() const; - constexpr const_iterator end() const; - constexpr const_iterator cbegin() const; - constexpr const_iterator cend() const; - constexpr iterator begin(); - constexpr iterator end(); - constexpr reverse_const_iterator rbegin() const; - constexpr reverse_const_iterator rend() const; - constexpr reverse_const_iterator crbegin() const; - constexpr reverse_const_iterator crend() const; - constexpr reverse_iterator rbegin(); - constexpr reverse_iterator rend(); -}; - -template constexpr auto begin(const Container &Cont) { - return Cont.begin(); -} - -template constexpr auto begin(Container &Cont) { - return Cont.begin(); -} - -template constexpr auto end(const Container &Cont) { - return Cont.end(); -} - -template constexpr auto end(Container &Cont) { - return Cont.end(); -} - -template constexpr auto cbegin(const Container &Cont) { - return Cont.cbegin(); -} - -template constexpr auto cend(const Container &Cont) { - return Cont.cend(); -} - -template constexpr auto rbegin(const Container &Cont) { - return Cont.rbegin(); -} - -template constexpr auto rbegin(Container &Cont) { - return Cont.rbegin(); -} - -template constexpr auto rend(const Container &Cont) { - return Cont.rend(); -} - -template constexpr auto rend(Container &Cont) { - return Cont.rend(); -} - -template constexpr auto crbegin(const Container &Cont) { - return Cont.crbegin(); -} - -template constexpr auto crend(const Container &Cont) { - return Cont.crend(); -} -// Find -template< class InputIt, class T > -InputIt find( InputIt first, InputIt last, const T& value ); - -// Reverse -template void reverse(Iter begin, Iter end); - -// Includes -template -bool includes(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2); - -// IsPermutation -template -bool is_permutation(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2); -template -bool is_permutation(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, - ForwardIt2 last2); - -// Equal -template -bool equal(InputIt1 first1, InputIt1 last1, InputIt2 first2); - -template -bool equal(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2); - -template -bool equal(InputIt1 first1, InputIt1 last1, - InputIt2 first2, InputIt2 last2, BinaryPred p) { - // Need a definition to suppress undefined_internal_type when invoked with lambda - return true; -} - -template -void iota(ForwardIt first, ForwardIt last, T value); - -} // namespace std +#include "fake_std.h" void Positives() { std::vector I, J; @@ -179,15 +74,15 @@ void Reverse(){ std::vector I, J; std::find(I.rbegin(), I.rend(), 0); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use a ranges version of this algorithm - // CHECK-FIXES: std::ranges::find(std::views::reverse(I), 0); + // CHECK-FIXES: std::ranges::find(std::ranges::reverse_view(I), 0); std::equal(std::rbegin(I), std::rend(I), J.begin(), J.end()); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use a ranges version of this algorithm - // CHECK-FIXES: std::ranges::equal(std::views::reverse(I), J); + // CHECK-FIXES: std::ranges::equal(std::ranges::reverse_view(I), J); std::equal(I.begin(), I.end(), std::crbegin(J), std::crend(J)); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use a ranges version of this algorithm - // CHECK-FIXES: std::ranges::equal(I, std::views::reverse(J)); + // CHECK-FIXES: std::ranges::equal(I, std::ranges::reverse_view(J)); } void Negatives() { diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-delayed.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-delayed.cpp index 53ec8713be3389..6a872824896131 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-delayed.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-delayed.cpp @@ -69,7 +69,8 @@ struct PositiveConstValueConstructor { template void templateWithNonTemplatizedParameter(const ExpensiveToCopyType S, T V) { // CHECK-MESSAGES: [[@LINE-1]]:90: warning: the const qualified parameter 'S' - // CHECK-FIXES-NOT: template void templateWithNonTemplatizedParameter(const ExpensiveToCopyType& S, T V) { + // CHECK-MESSAGES: [[@LINE-2]]:95: warning: the parameter 'V' + // CHECK-FIXES: template void templateWithNonTemplatizedParameter(const ExpensiveToCopyType& S, const T& V) { } void instantiated() { diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-templates.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-templates.cpp new file mode 100644 index 00000000000000..688c79bbaa9ac5 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-templates.cpp @@ -0,0 +1,98 @@ +// RUN: %check_clang_tidy -std=c++14-or-later %s performance-unnecessary-value-param %t + +struct ExpensiveToCopyType { + virtual ~ExpensiveToCopyType(); +}; + +template void templateWithNonTemplatizedParameter(const ExpensiveToCopyType S, T V) { + // CHECK-MESSAGES: [[@LINE-1]]:90: warning: the const qualified parameter 'S' + // CHECK-MESSAGES: [[@LINE-2]]:95: warning: the parameter 'V' + // CHECK-FIXES: template void templateWithNonTemplatizedParameter(const ExpensiveToCopyType& S, const T& V) { +} + +void instantiatedWithExpensiveValue() { + templateWithNonTemplatizedParameter( + ExpensiveToCopyType(), ExpensiveToCopyType()); + templateWithNonTemplatizedParameter(ExpensiveToCopyType(), 5); +} + +template void templateWithNonTemplatizedParameterCheapTemplate(const ExpensiveToCopyType S, T V) { + // CHECK-MESSAGES: [[@LINE-1]]:103: warning: the const qualified parameter 'S' + // CHECK-FIXES: template void templateWithNonTemplatizedParameterCheapTemplate(const ExpensiveToCopyType& S, T V) { +} + +void instantiatedWithCheapValue() { + templateWithNonTemplatizedParameterCheapTemplate(ExpensiveToCopyType(), 5); +} + +template void nonInstantiatedTemplateWithConstValue(const T S) {} +template void nonInstantiatedTemplateWithNonConstValue(T S) {} + +template void instantiatedTemplateSpecialization(T NoSpecS) {} +template <> void instantiatedTemplateSpecialization(int SpecSInt) {} +// Updating template specialization would also require to update the main +// template and other specializations. Such specializations may be +// spreaded across different translation units. +// For that reason we only issue a warning, but do not propose fixes. +template <> +void instantiatedTemplateSpecialization( + ExpensiveToCopyType SpecSExpensiveToCopy) { + // CHECK-MESSAGES: [[@LINE-1]]:25: warning: the parameter 'SpecSExpensiveToCopy' + // CHECK-FIXES-NOT: const T& NoSpecS + // CHECK-FIXES-NOT: const int& SpecSInt + // CHECK-FIXES-NOT: const ExpensiveToCopyType& SpecSExpensiveToCopy +} + +void instantiatedTemplateSpecialization() { + instantiatedTemplateSpecialization(ExpensiveToCopyType()); +} + +template void instantiatedTemplateWithConstValue(const T S) { + // CHECK-MESSAGES: [[@LINE-1]]:71: warning: the const qualified parameter 'S' + // CHECK-FIXES: template void instantiatedTemplateWithConstValue(const T& S) { +} + +void instantiatedConstValue() { + instantiatedTemplateWithConstValue(ExpensiveToCopyType()); +} + +template void instantiatedTemplateWithNonConstValue(T S) { + // CHECK-MESSAGES: [[@LINE-1]]:68: warning: the parameter 'S' + // CHECK-FIXES: template void instantiatedTemplateWithNonConstValue(const T& S) { +} + +void instantiatedNonConstValue() { + instantiatedTemplateWithNonConstValue(ExpensiveToCopyType()); +} + +void lambdaConstValue() { + auto fn = [](const ExpensiveToCopyType S) { + // CHECK-MESSAGES: [[@LINE-1]]:42: warning: the const qualified parameter 'S' + // CHECK-FIXES: auto fn = [](const ExpensiveToCopyType& S) { + }; + fn(ExpensiveToCopyType()); +} + +void lambdaNonConstValue() { + auto fn = [](ExpensiveToCopyType S) { + // CHECK-MESSAGES: [[@LINE-1]]:36: warning: the parameter 'S' + // CHECK-FIXES: auto fn = [](const ExpensiveToCopyType& S) { + }; + fn(ExpensiveToCopyType()); +} + +void lambdaConstAutoValue() { + auto fn = [](const auto S) { + // CHECK-MESSAGES: [[@LINE-1]]:27: warning: the const qualified parameter 'S' + // CHECK-FIXES: auto fn = [](const auto& S) { + }; + fn(ExpensiveToCopyType()); +} + +void lambdaNonConstAutoValue() { + auto fn = [](auto S) { + // CHECK-MESSAGES: [[@LINE-1]]:21: warning: the parameter 'S' + // CHECK-FIXES: auto fn = [](const auto& S) { + }; + fn(ExpensiveToCopyType()); +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param.cpp index d578eedd94a390..0dffaefa213a45 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param.cpp @@ -107,19 +107,6 @@ struct PositiveConstValueConstructor { // CHECK-FIXES: PositiveConstValueConstructor(const ExpensiveToCopyType& ConstCopy) {} }; -template void templateWithNonTemplatizedParameter(const ExpensiveToCopyType S, T V) { - // CHECK-MESSAGES: [[@LINE-1]]:90: warning: the const qualified parameter 'S' - // CHECK-FIXES-NOT: template void templateWithNonTemplatizedParameter(const ExpensiveToCopyType& S, T V) { -} - -void instantiated() { - templateWithNonTemplatizedParameter(ExpensiveToCopyType(), ExpensiveToCopyType()); - templateWithNonTemplatizedParameter(ExpensiveToCopyType(), 5); -} - -template void negativeTemplateType(const T V) { -} - void negativeArray(const ExpensiveToCopyType[]) { } @@ -370,35 +357,3 @@ void fun() { ExpensiveToCopyType E; NegativeUsingConstructor S(E); } - -template -void templateFunction(T) { -} - -template<> -void templateFunction(ExpensiveToCopyType E) { - // CHECK-MESSAGES: [[@LINE-1]]:64: warning: the parameter 'E' is copied - // CHECK-FIXES: void templateFunction(ExpensiveToCopyType E) { - E.constReference(); -} - -template -T templateSpecializationFunction(ExpensiveToCopyType E) { - // CHECK-MESSAGES: [[@LINE-1]]:54: warning: the parameter 'E' is copied - // CHECK-FIXES-NOT: T templateSpecializationFunction(const ExpensiveToCopyType& E) { - return T(); -} - -template <> -bool templateSpecializationFunction(ExpensiveToCopyType E) { - // CHECK-MESSAGES: [[@LINE-1]]:57: warning: the parameter 'E' is copied - // CHECK-FIXES-NOT: bool templateSpecializationFunction(const ExpensiveToCopyType& E) { - return true; -} - -template <> -int templateSpecializationFunction(ExpensiveToCopyType E) { - // CHECK-MESSAGES: [[@LINE-1]]:56: warning: the parameter 'E' is copied - // CHECK-FIXES-NOT: int templateSpecializationFunction(const ExpensiveToCopyType& E) { - return 0; -} diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py index 1d3ab891904076..be024da5e005c8 100644 --- a/clang/bindings/python/clang/cindex.py +++ b/clang/bindings/python/clang/cindex.py @@ -1820,6 +1820,18 @@ def availability(self): return AvailabilityKind.from_id(self._availability) + @property + def binary_operator(self): + """ + Retrieves the opcode if this cursor points to a binary operator + :return: + """ + + if not hasattr(self, "_binopcode"): + self._binopcode = conf.lib.clang_Cursor_getBinaryOpcode(self) + + return BinaryOperator.from_id(self._binopcode) + @property def access_specifier(self): """ @@ -2110,6 +2122,55 @@ def from_cursor_result(res, fn, args): return res +class BinaryOperator(BaseEnumeration): + """ + Describes the BinaryOperator of a declaration + """ + + def __nonzero__(self): + """Allows checks of the kind ```if cursor.binary_operator:```""" + return self.value != 0 + + @property + def is_assignment(self): + return BinaryOperator.Assign.value <= self.value < BinaryOperator.Comma.value + + Invalid = 0 + PtrMemD = 1 + PtrMemI = 2 + Mul = 3 + Div = 4 + Rem = 5 + Add = 6 + Sub = 7 + Shl = 8 + Shr = 9 + Cmp = 10 + LT = 11 + GT = 12 + LE = 13 + GE = 14 + EQ = 15 + NE = 16 + And = 17 + Xor = 18 + Or = 19 + LAnd = 20 + LOr = 21 + Assign = 22 + MulAssign = 23 + DivAssign = 24 + RemAssign = 25 + AddAssign = 26 + SubAssign = 27 + ShlAssign = 28 + ShrAssign = 29 + AndAssign = 30 + XorAssign = 31 + OrAssign = 32 + Comma = 33 + + class StorageClass(BaseEnumeration): """ Describes the storage class of a declaration @@ -3847,6 +3908,7 @@ def write_main_file_to_stdout(self): ("clang_Cursor_getTemplateArgumentUnsignedValue", [Cursor, c_uint], c_ulonglong), ("clang_Cursor_isAnonymous", [Cursor], bool), ("clang_Cursor_isBitField", [Cursor], bool), + ("clang_Cursor_getBinaryOpcode", [Cursor], c_int), ("clang_Cursor_getBriefCommentText", [Cursor], _CXString, _CXString.from_result), ("clang_Cursor_getRawCommentText", [Cursor], _CXString, _CXString.from_result), ("clang_Cursor_getOffsetOfField", [Cursor], c_longlong), @@ -4016,6 +4078,7 @@ def function_exists(self, name): __all__ = [ "AvailabilityKind", + "BinaryOperator", "Config", "CodeCompletionResults", "CompilationDatabase", diff --git a/clang/bindings/python/tests/cindex/test_cursor.py b/clang/bindings/python/tests/cindex/test_cursor.py index 84cd8139418447..7476947bde2ea6 100644 --- a/clang/bindings/python/tests/cindex/test_cursor.py +++ b/clang/bindings/python/tests/cindex/test_cursor.py @@ -13,6 +13,7 @@ from clang.cindex import TemplateArgumentKind from clang.cindex import TranslationUnit from clang.cindex import TypeKind +from clang.cindex import BinaryOperator from .util import get_cursor from .util import get_cursors from .util import get_tu @@ -54,6 +55,64 @@ class C { void foo<-7, float, true>(); """ +kBinops = """\ +struct C { + int m; + }; + + void func(void){ + int a, b; + int C::* p = &C:: + + C c; + c.*p; + + C* pc; + pc->*p; + + a * b; + a / b; + a % b; + a + b; + a - b; + + a << b; + a >> b; + + a < b; + a > b; + + a <= b; + a >= b; + a == b; + a != b; + + a & b; + a ^ b; + a | b; + + a && b; + a || b; + + a = b; + + a *= b; + a /= b; + a %= b; + a += b; + a -= b; + + a <<= b; + a >>= b; + + a &= b; + a ^= b; + a |= b; + a , b; + + } + """ + class TestCursor(unittest.TestCase): def test_get_children(self): @@ -695,3 +754,48 @@ def test_mangled_name(self): self.assertIn( foo.mangled_name, ("_Z3fooii", "__Z3fooii", "?foo@@YAHHH", "?foo@@YAHHH@Z") ) + + def test_binop(self): + tu = get_tu(kBinops, lang="cpp") + + operators = { + # not exposed yet + # ".*" : BinaryOperator.PtrMemD, + "->*": BinaryOperator.PtrMemI, + "*": BinaryOperator.Mul, + "/": BinaryOperator.Div, + "%": BinaryOperator.Rem, + "+": BinaryOperator.Add, + "-": BinaryOperator.Sub, + "<<": BinaryOperator.Shl, + ">>": BinaryOperator.Shr, + # tests do not run in C++2a mode so this operator is not available + # "<=>" : BinaryOperator.Cmp, + "<": BinaryOperator.LT, + ">": BinaryOperator.GT, + "<=": BinaryOperator.LE, + ">=": BinaryOperator.GE, + "==": BinaryOperator.EQ, + "!=": BinaryOperator.NE, + "&": BinaryOperator.And, + "^": BinaryOperator.Xor, + "|": BinaryOperator.Or, + "&&": BinaryOperator.LAnd, + "||": BinaryOperator.LOr, + "=": BinaryOperator.Assign, + "*=": BinaryOperator.MulAssign, + "/=": BinaryOperator.DivAssign, + "%=": BinaryOperator.RemAssign, + "+=": BinaryOperator.AddAssign, + "-=": BinaryOperator.SubAssign, + "<<=": BinaryOperator.ShlAssign, + ">>=": BinaryOperator.ShrAssign, + "&=": BinaryOperator.AndAssign, + "^=": BinaryOperator.XorAssign, + "|=": BinaryOperator.OrAssign, + ",": BinaryOperator.Comma, + } + + for op, typ in operators.items(): + c = get_cursor(tu, op) + assert c.binary_operator == typ diff --git a/clang/bindings/python/tests/cindex/test_enums.py b/clang/bindings/python/tests/cindex/test_enums.py index d75052954820c8..63b2292c5d9bdc 100644 --- a/clang/bindings/python/tests/cindex/test_enums.py +++ b/clang/bindings/python/tests/cindex/test_enums.py @@ -12,6 +12,7 @@ LinkageKind, TLSKind, StorageClass, + BinaryOperator, ) @@ -28,6 +29,7 @@ class TestEnums(unittest.TestCase): LinkageKind, TLSKind, StorageClass, + BinaryOperator, ] def test_from_id(self): diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index b8c9db49863c60..04efd0683dbc4b 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -305,7 +305,7 @@ foreach(target armv6m-unknown-eabi;armv7m-unknown-eabi;armv8m.main-unknown-eabi) set(BUILTINS_${target}_CMAKE_SYSTEM_NAME Generic CACHE STRING "") set(BUILTINS_${target}_CMAKE_SYSTEM_PROCESSOR arm CACHE STRING "") set(BUILTINS_${target}_CMAKE_SYSROOT "" CACHE STRING "") - set(BUILTINS_${target}_CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "") + set(BUILTINS_${target}_CMAKE_BUILD_TYPE MinSizeRel CACHE STRING "") foreach(lang C;CXX;ASM) set(BUILTINS_${target}_CMAKE_${lang}_local_flags "--target=${target} -mthumb") if(${target} STREQUAL "armv8m.main-unknown-eabi") @@ -322,10 +322,12 @@ foreach(target armv6m-unknown-eabi;armv7m-unknown-eabi;armv8m.main-unknown-eabi) set(RUNTIMES_${target}_CMAKE_SYSTEM_NAME Generic CACHE STRING "") set(RUNTIMES_${target}_CMAKE_SYSTEM_PROCESSOR arm CACHE STRING "") set(RUNTIMES_${target}_CMAKE_SYSROOT "" CACHE STRING "") - set(RUNTIMES_${target}_CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "") + set(RUNTIMES_${target}_CMAKE_BUILD_TYPE MinSizeRel CACHE STRING "") set(RUNTIMES_${target}_CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY CACHE STRING "") foreach(lang C;CXX;ASM) - set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "--target=${target} -mthumb -Wno-atomic-alignment" CACHE STRING "") + # TODO: The preprocessor defines workaround various issues in libc and libc++ integration. + # These should be addressed and removed over time. + set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "--target=${target} -mthumb -Wno-atomic-alignment -D'vfprintf(stream, format, vlist)=vprintf(format, vlist)' -D'fprintf(stream, format, ...)=printf(format)' -D'timeval=struct timeval{int tv_sec; int tv_usec;}' -D'gettimeofday(tv, tz)' -D_LIBCPP_PRINT=1" CACHE STRING "") endforeach() foreach(type SHARED;MODULE;EXE) set(RUNTIMES_${target}_CMAKE_${type}_LINKER_FLAGS "-fuse-ld=lld" CACHE STRING "") @@ -335,7 +337,7 @@ foreach(target armv6m-unknown-eabi;armv7m-unknown-eabi;armv8m.main-unknown-eabi) set(RUNTIMES_${target}_LIBCXX_ABI_VERSION 2 CACHE STRING "") set(RUNTIMES_${target}_LIBCXX_CXX_ABI none CACHE STRING "") set(RUNTIMES_${target}_LIBCXX_ENABLE_SHARED OFF CACHE BOOL "") - set(RUNTIMES_${target}_LIBCXX_ENABLE_STATIC OFF CACHE BOOL "") + set(RUNTIMES_${target}_LIBCXX_ENABLE_STATIC ON CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_LOCALIZATION OFF CACHE BOOL "") @@ -357,7 +359,7 @@ foreach(target riscv32-unknown-elf) set(BUILTINS_${target}_CMAKE_SYSTEM_NAME Generic CACHE STRING "") set(BUILTINS_${target}_CMAKE_SYSTEM_PROCESSOR RISCV CACHE STRING "") set(BUILTINS_${target}_CMAKE_SYSROOT "" CACHE STRING "") - set(BUILTINS_${target}_CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "") + set(BUILTINS_${target}_CMAKE_BUILD_TYPE MinSizeRel CACHE STRING "") foreach(lang C;CXX;ASM) set(BUILTINS_${target}_CMAKE_${lang}_FLAGS "--target=${target} -march=rv32imafc -mabi=ilp32f" CACHE STRING "") endforeach() @@ -370,10 +372,12 @@ foreach(target riscv32-unknown-elf) set(RUNTIMES_${target}_CMAKE_SYSTEM_NAME Generic CACHE STRING "") set(RUNTIMES_${target}_CMAKE_SYSTEM_PROCESSOR RISCV CACHE STRING "") set(RUNTIMES_${target}_CMAKE_SYSROOT "" CACHE STRING "") - set(RUNTIMES_${target}_CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "") + set(RUNTIMES_${target}_CMAKE_BUILD_TYPE MinSizeRel CACHE STRING "") set(RUNTIMES_${target}_CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY CACHE STRING "") foreach(lang C;CXX;ASM) - set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "--target=${target} -march=rv32imafc -mabi=ilp32f" CACHE STRING "") + # TODO: The preprocessor defines workaround various issues in libc and libc++ integration. + # These should be addressed and removed over time. + set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "--target=${target} -march=rv32imafc -mabi=ilp32f -D'vfprintf(stream, format, vlist)=vprintf(format, vlist)' -D'fprintf(stream, format, ...)=printf(format)' -D'timeval=struct timeval{int tv_sec; int tv_usec;}' -D'gettimeofday(tv, tz)' -D_LIBCPP_PRINT=1" CACHE STRING "") endforeach() foreach(type SHARED;MODULE;EXE) set(RUNTIMES_${target}_CMAKE_${type}_LINKER_FLAGS "-fuse-ld=lld" CACHE STRING "") @@ -383,7 +387,7 @@ foreach(target riscv32-unknown-elf) set(RUNTIMES_${target}_LIBCXX_ABI_VERSION 2 CACHE STRING "") set(RUNTIMES_${target}_LIBCXX_CXX_ABI none CACHE STRING "") set(RUNTIMES_${target}_LIBCXX_ENABLE_SHARED OFF CACHE BOOL "") - set(RUNTIMES_${target}_LIBCXX_ENABLE_STATIC OFF CACHE BOOL "") + set(RUNTIMES_${target}_LIBCXX_ENABLE_STATIC ON CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_LOCALIZATION OFF CACHE BOOL "") diff --git a/clang/docs/CommandGuide/clang.rst b/clang/docs/CommandGuide/clang.rst index a348f3640c5eb8..29154292dc7a5e 100644 --- a/clang/docs/CommandGuide/clang.rst +++ b/clang/docs/CommandGuide/clang.rst @@ -400,6 +400,14 @@ number of cross compilers, or may only support a native target. option is only supported on AArch64 and RISC-V. On RISC-V, this option also prints out the ISA string of enabled extensions. +.. option:: --print-supported-extensions + + Prints the list of all extensions that are supported for every CPU target + for an architecture (specified through ``--target=`` or + :option:`-arch` ````). If no target is specified, the system + default target will be used. Currently, this option is only supported on + AArch64 and RISC-V. + Code Generation Options ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 5dc0f8b7e0bbb8..d0138d6b00017a 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -40,8 +40,6 @@ code bases. - Setting the deprecated CMake variable ``GCC_INSTALL_PREFIX`` (which sets the default ``--gcc-toolchain=``) now leads to a fatal error. -- The ``le32`` and ``le64`` targets have been removed. - C/C++ Language Potentially Breaking Changes ------------------------------------------- @@ -313,10 +311,6 @@ Resolutions to C++ Defect Reports - Clang now considers ``noexcept(typeid(expr))`` more carefully, instead of always assuming that ``std::bad_typeid`` can be thrown. (`CWG2191: Incorrect result for noexcept(typeid(v)) `_). -- Clang now correctly implements lookup for the terminal name of a member-qualified nested-name-specifier. - (`CWG1835: Dependent member lookup before < `_). - The warning can be disabled via `-Wno-missing-dependent-template-keyword`. - C Language Changes ------------------ @@ -830,6 +824,8 @@ Bug Fixes in This Version - ``__is_trivially_equality_comparable`` no longer returns true for types which have a constrained defaulted comparison operator (#GH89293). +- Fixed Clang from generating dangling StringRefs when deserializing Exprs & Stmts (#GH98667) + Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1021,7 +1017,6 @@ Bug Fixes to C++ Support (#GH88081), (#GH89496), (#GH90669), (#GH91633) and (#GH97453). - Fixed a crash in constraint instantiation under nested lambdas with dependent parameters. - Fixed handling of brace ellison when building deduction guides. (#GH64625), (#GH83368). -- Clang now instantiates local constexpr functions eagerly for constant evaluators. (#GH35052), (#GH94849) - Fixed a failed assertion when attempting to convert an integer representing the difference between the addresses of two labels (a GNU extension) to a pointer within a constant expression. (#GH95366). - Fix immediate escalation bugs in the presence of dependent call arguments. (#GH94935) @@ -1041,6 +1036,9 @@ Bug Fixes to C++ Support (#GH48937) - Fix a crash when parsing an invalid type-requirement in a requires expression. (#GH51868) - Fix parsing of built-in type-traits such as ``__is_pointer`` in libstdc++ headers. (#GH95598) +- Fixed failed assertion when resolving context of defaulted comparison method outside of struct. (#GH96043). +- Clang now diagnoses explicit object parameters in member pointers and other contexts where they should not appear. + Fixes (#GH85992). Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1078,6 +1076,25 @@ X86 Support ^^^^^^^^^^^ - Remove knl/knm specific ISA supports: AVX512PF, AVX512ER, PREFETCHWT1 +- Support has been removed for the AMD "3DNow!" instruction-set. + Neither modern AMD CPUs, nor any Intel CPUs implement these + instructions, and they were never widely used. + + * The options ``-m3dnow`` and ``-m3dnowa`` are no longer honored, and will emit a warning if used. + * The macros ``__3dNOW__`` and ``__3dNOW_A__`` are no longer ever set by the compiler. + * The header ```` is deprecated, and emits a warning if included. + * The 3dNow intrinsic functions have been removed: ``_m_femms``, + ``_m_pavgusb``, ``_m_pf2id``, ``_m_pfacc``, ``_m_pfadd``, + ``_m_pfcmpeq``, ``_m_pfcmpge``, ``_m_pfcmpgt``, ``_m_pfmax``, + ``_m_pfmin``, ``_m_pfmul``, ``_m_pfrcp``, ``_m_pfrcpit1``, + ``_m_pfrcpit2``, ``_m_pfrsqrt``, ``_m_pfrsqrtit1``, ``_m_pfsub``, + ``_m_pfsubr``, ``_m_pi2fd``, ``_m_pmulhrw``, ``_m_pf2iw``, + ``_m_pfnacc``, ``_m_pfpnacc``, ``_m_pi2fw``, ``_m_pswapdsf``, + ``_m_pswapdsi``. + * The compiler builtins corresponding to each of the above + intrinsics have also been removed (``__builtin_ia32_femms``, and so on). + * "3DNow!" instructions remain supported in assembly code, including + inside inline-assembly. Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ @@ -1151,6 +1168,10 @@ RISC-V Support - ``__attribute__((rvv_vector_bits(N)))`` is now supported for RVV vbool*_t types. - Profile names in ``-march`` option are now supported. - Passing empty structs/unions as arguments in C++ is now handled correctly. The behavior is similar to GCC's. +- ``-m[no-]scalar-strict-align`` and ``-m[no-]vector-strict-align`` options have + been added to give separate control of whether scalar or vector misaligned + accesses may be created. ``-m[no-]strict-align`` applies to both scalar and + vector. CUDA/HIP Language Changes ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1188,6 +1209,8 @@ DWARF Support in Clang Floating Point Support in Clang ------------------------------- +- Add ``__builtin__fmaf16`` builtin for floating point types. + Fixed Point Support in Clang ---------------------------- @@ -1306,6 +1329,8 @@ Python Binding Changes - Exposed `CXRewriter` API as `class Rewriter`. - Add some missing kinds from Index.h (CursorKind: 149-156, 272-320, 420-437. TemplateArgumentKind: 5-9. TypeKind: 161-175 and 178). +- Add support for retrieving binary operator information through + Cursor.binary_operator(). OpenMP Support -------------- diff --git a/clang/docs/tools/clang-formatted-files.txt b/clang/docs/tools/clang-formatted-files.txt index 62871133a68075..a8ee8f1fcb87c2 100644 --- a/clang/docs/tools/clang-formatted-files.txt +++ b/clang/docs/tools/clang-formatted-files.txt @@ -362,6 +362,7 @@ clang/lib/Basic/Targets/BPF.cpp clang/lib/Basic/Targets/BPF.h clang/lib/Basic/Targets/Hexagon.h clang/lib/Basic/Targets/Lanai.h +clang/lib/Basic/Targets/Le64.h clang/lib/Basic/Targets/M68k.h clang/lib/Basic/Targets/MSP430.h clang/lib/Basic/Targets/NVPTX.cpp diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index ce2282937f86cb..24ed23a6287286 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -3750,6 +3750,59 @@ enum CX_StorageClass { CX_SC_Register }; +/** + * Represents a specific kind of binary operator which can appear at a cursor. + */ +enum CX_BinaryOperatorKind { + CX_BO_Invalid = 0, + CX_BO_PtrMemD = 1, + CX_BO_PtrMemI = 2, + CX_BO_Mul = 3, + CX_BO_Div = 4, + CX_BO_Rem = 5, + CX_BO_Add = 6, + CX_BO_Sub = 7, + CX_BO_Shl = 8, + CX_BO_Shr = 9, + CX_BO_Cmp = 10, + CX_BO_LT = 11, + CX_BO_GT = 12, + CX_BO_LE = 13, + CX_BO_GE = 14, + CX_BO_EQ = 15, + CX_BO_NE = 16, + CX_BO_And = 17, + CX_BO_Xor = 18, + CX_BO_Or = 19, + CX_BO_LAnd = 20, + CX_BO_LOr = 21, + CX_BO_Assign = 22, + CX_BO_MulAssign = 23, + CX_BO_DivAssign = 24, + CX_BO_RemAssign = 25, + CX_BO_AddAssign = 26, + CX_BO_SubAssign = 27, + CX_BO_ShlAssign = 28, + CX_BO_ShrAssign = 29, + CX_BO_AndAssign = 30, + CX_BO_XorAssign = 31, + CX_BO_OrAssign = 32, + CX_BO_Comma = 33, + CX_BO_LAST = CX_BO_Comma +}; + +/** + * \brief Returns the operator code for the binary operator. + */ +CINDEX_LINKAGE enum CX_BinaryOperatorKind +clang_Cursor_getBinaryOpcode(CXCursor C); + +/** + * \brief Returns a string containing the spelling of the binary operator. + */ +CINDEX_LINKAGE CXString +clang_Cursor_getBinaryOpcodeStr(enum CX_BinaryOperatorKind Op); + /** * Returns the storage class for a function or variable declaration. * diff --git a/clang/include/clang/APINotes/Types.h b/clang/include/clang/APINotes/Types.h index daf2f1897f46bd..b389aa8d56f167 100644 --- a/clang/include/clang/APINotes/Types.h +++ b/clang/include/clang/APINotes/Types.h @@ -263,13 +263,6 @@ class ContextInfo : public CommonTypeInfo { SwiftObjCMembers = Value.value_or(false); } - /// Strip off any information within the class information structure that is - /// module-local, such as 'audited' flags. - void stripModuleLocalInfo() { - HasDefaultNullability = false; - DefaultNullability = 0; - } - friend bool operator==(const ContextInfo &, const ContextInfo &); ContextInfo &operator|=(const ContextInfo &RHS) { diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index 5957f14098363e..561a9d872acfb0 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -542,12 +542,9 @@ class LabelDecl : public NamedDecl { }; /// Represent a C++ namespace. -class NamespaceDecl : public NamedDecl, public DeclContext, - public Redeclarable -{ - - enum Flags : unsigned { F_Inline = 1 << 0, F_Nested = 1 << 1 }; - +class NamespaceDecl : public NamedDecl, + public DeclContext, + public Redeclarable { /// The starting location of the source range, pointing /// to either the namespace or the inline keyword. SourceLocation LocStart; @@ -555,12 +552,8 @@ class NamespaceDecl : public NamedDecl, public DeclContext, /// The ending location of the source range. SourceLocation RBraceLoc; - /// A pointer to either the anonymous namespace that lives just inside - /// this namespace or to the first namespace in the chain (the latter case - /// only when this is not the first in the chain), along with a - /// boolean value indicating whether this is an inline namespace. - llvm::PointerIntPair - AnonOrFirstNamespaceAndFlags; + /// The unnamed namespace that inhabits this namespace, if any. + NamespaceDecl *AnonymousNamespace = nullptr; NamespaceDecl(ASTContext &C, DeclContext *DC, bool Inline, SourceLocation StartLoc, SourceLocation IdLoc, @@ -607,35 +600,19 @@ class NamespaceDecl : public NamedDecl, public DeclContext, } /// Returns true if this is an inline namespace declaration. - bool isInline() const { - return AnonOrFirstNamespaceAndFlags.getInt() & F_Inline; - } + bool isInline() const { return NamespaceDeclBits.IsInline; } /// Set whether this is an inline namespace declaration. - void setInline(bool Inline) { - unsigned F = AnonOrFirstNamespaceAndFlags.getInt(); - if (Inline) - AnonOrFirstNamespaceAndFlags.setInt(F | F_Inline); - else - AnonOrFirstNamespaceAndFlags.setInt(F & ~F_Inline); - } + void setInline(bool Inline) { NamespaceDeclBits.IsInline = Inline; } /// Returns true if this is a nested namespace declaration. /// \code /// namespace outer::nested { } /// \endcode - bool isNested() const { - return AnonOrFirstNamespaceAndFlags.getInt() & F_Nested; - } + bool isNested() const { return NamespaceDeclBits.IsNested; } /// Set whether this is a nested namespace declaration. - void setNested(bool Nested) { - unsigned F = AnonOrFirstNamespaceAndFlags.getInt(); - if (Nested) - AnonOrFirstNamespaceAndFlags.setInt(F | F_Nested); - else - AnonOrFirstNamespaceAndFlags.setInt(F & ~F_Nested); - } + void setNested(bool Nested) { NamespaceDeclBits.IsNested = Nested; } /// Returns true if the inline qualifier for \c Name is redundant. bool isRedundantInlineQualifierFor(DeclarationName Name) const { @@ -649,34 +626,18 @@ class NamespaceDecl : public NamedDecl, public DeclContext, std::distance(Y.begin(), Y.end()); } - /// Get the original (first) namespace declaration. - NamespaceDecl *getOriginalNamespace(); - - /// Get the original (first) namespace declaration. - const NamespaceDecl *getOriginalNamespace() const; - - /// Return true if this declaration is an original (first) declaration - /// of the namespace. This is false for non-original (subsequent) namespace - /// declarations and anonymous namespaces. - bool isOriginalNamespace() const; - - /// Retrieve the anonymous namespace nested inside this namespace, - /// if any. + /// Retrieve the anonymous namespace that inhabits this namespace, if any. NamespaceDecl *getAnonymousNamespace() const { - return getOriginalNamespace()->AnonOrFirstNamespaceAndFlags.getPointer(); + return getFirstDecl()->AnonymousNamespace; } void setAnonymousNamespace(NamespaceDecl *D) { - getOriginalNamespace()->AnonOrFirstNamespaceAndFlags.setPointer(D); + getFirstDecl()->AnonymousNamespace = D; } /// Retrieves the canonical declaration of this namespace. - NamespaceDecl *getCanonicalDecl() override { - return getOriginalNamespace(); - } - const NamespaceDecl *getCanonicalDecl() const { - return getOriginalNamespace(); - } + NamespaceDecl *getCanonicalDecl() override { return getFirstDecl(); } + const NamespaceDecl *getCanonicalDecl() const { return getFirstDecl(); } SourceRange getSourceRange() const override LLVM_READONLY { return SourceRange(LocStart, RBraceLoc); diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 6c711cfe7927b2..40f01abf384e92 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -1487,6 +1487,27 @@ class DeclContext { /// Number of bits in DeclContextBitfields. enum { NumDeclContextBits = 13 }; + /// Stores the bits used by NamespaceDecl. + /// If modified NumNamespaceDeclBits and the accessor + /// methods in NamespaceDecl should be updated appropriately. + class NamespaceDeclBitfields { + friend class NamespaceDecl; + /// For the bits in DeclContextBitfields + LLVM_PREFERRED_TYPE(DeclContextBitfields) + uint64_t : NumDeclContextBits; + + /// True if this is an inline namespace. + LLVM_PREFERRED_TYPE(bool) + uint64_t IsInline : 1; + + /// True if this is a nested-namespace-definition. + LLVM_PREFERRED_TYPE(bool) + uint64_t IsNested : 1; + }; + + /// Number of inherited and non-inherited bits in NamespaceDeclBitfields. + enum { NumNamespaceDeclBits = NumDeclContextBits + 2 }; + /// Stores the bits used by TagDecl. /// If modified NumTagDeclBits and the accessor /// methods in TagDecl should be updated appropriately. @@ -1985,6 +2006,7 @@ class DeclContext { /// 8 bytes with static_asserts in the ctor of DeclContext. union { DeclContextBitfields DeclContextBits; + NamespaceDeclBitfields NamespaceDeclBits; TagDeclBitfields TagDeclBits; EnumDeclBitfields EnumDeclBits; RecordDeclBitfields RecordDeclBits; @@ -1998,6 +2020,8 @@ class DeclContext { static_assert(sizeof(DeclContextBitfields) <= 8, "DeclContextBitfields is larger than 8 bytes!"); + static_assert(sizeof(NamespaceDeclBitfields) <= 8, + "NamespaceDeclBitfields is larger than 8 bytes!"); static_assert(sizeof(TagDeclBitfields) <= 8, "TagDeclBitfields is larger than 8 bytes!"); static_assert(sizeof(EnumDeclBitfields) <= 8, diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index edaea6fe27cc3d..c2feac525c1ea6 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -3676,9 +3676,9 @@ class CXXUnresolvedConstructExpr final /// an implicit access if a qualifier is provided. class CXXDependentScopeMemberExpr final : public Expr, - private llvm::TrailingObjects< - CXXDependentScopeMemberExpr, NestedNameSpecifierLoc, DeclAccessPair, - ASTTemplateKWAndArgsInfo, TemplateArgumentLoc> { + private llvm::TrailingObjects { friend class ASTStmtReader; friend class ASTStmtWriter; friend TrailingObjects; @@ -3691,15 +3691,17 @@ class CXXDependentScopeMemberExpr final /// implicit accesses. QualType BaseType; + /// The nested-name-specifier that precedes the member name, if any. + /// FIXME: This could be in principle store as a trailing object. + /// However the performance impact of doing so should be investigated first. + NestedNameSpecifierLoc QualifierLoc; + /// The member to which this member expression refers, which /// can be name, overloaded operator, or destructor. /// /// FIXME: could also be a template-id DeclarationNameInfo MemberNameInfo; - /// The location of the '->' or '.' operator. - SourceLocation OperatorLoc; - // CXXDependentScopeMemberExpr is followed by several trailing objects, // some of which optional. They are in order: // @@ -3719,16 +3721,8 @@ class CXXDependentScopeMemberExpr final return CXXDependentScopeMemberExprBits.HasTemplateKWAndArgsInfo; } - unsigned getNumUnqualifiedLookups() const { - return CXXDependentScopeMemberExprBits.NumUnqualifiedLookups; - } - - unsigned numTrailingObjects(OverloadToken) const { - return hasQualifier(); - } - - unsigned numTrailingObjects(OverloadToken) const { - return getNumUnqualifiedLookups(); + bool hasFirstQualifierFoundInScope() const { + return CXXDependentScopeMemberExprBits.HasFirstQualifierFoundInScope; } unsigned numTrailingObjects(OverloadToken) const { @@ -3739,32 +3733,33 @@ class CXXDependentScopeMemberExpr final return getNumTemplateArgs(); } + unsigned numTrailingObjects(OverloadToken) const { + return hasFirstQualifierFoundInScope(); + } + CXXDependentScopeMemberExpr(const ASTContext &Ctx, Expr *Base, QualType BaseType, bool IsArrow, SourceLocation OperatorLoc, NestedNameSpecifierLoc QualifierLoc, SourceLocation TemplateKWLoc, - ArrayRef UnqualifiedLookups, + NamedDecl *FirstQualifierFoundInScope, DeclarationNameInfo MemberNameInfo, const TemplateArgumentListInfo *TemplateArgs); - CXXDependentScopeMemberExpr(EmptyShell Empty, bool HasQualifier, - unsigned NumUnqualifiedLookups, - bool HasTemplateKWAndArgsInfo); + CXXDependentScopeMemberExpr(EmptyShell Empty, bool HasTemplateKWAndArgsInfo, + bool HasFirstQualifierFoundInScope); public: static CXXDependentScopeMemberExpr * Create(const ASTContext &Ctx, Expr *Base, QualType BaseType, bool IsArrow, SourceLocation OperatorLoc, NestedNameSpecifierLoc QualifierLoc, - SourceLocation TemplateKWLoc, - ArrayRef UnqualifiedLookups, + SourceLocation TemplateKWLoc, NamedDecl *FirstQualifierFoundInScope, DeclarationNameInfo MemberNameInfo, const TemplateArgumentListInfo *TemplateArgs); static CXXDependentScopeMemberExpr * - CreateEmpty(const ASTContext &Ctx, bool HasQualifier, - unsigned NumUnqualifiedLookups, bool HasTemplateKWAndArgsInfo, - unsigned NumTemplateArgs); + CreateEmpty(const ASTContext &Ctx, bool HasTemplateKWAndArgsInfo, + unsigned NumTemplateArgs, bool HasFirstQualifierFoundInScope); /// True if this is an implicit access, i.e. one in which the /// member being accessed was not written in the source. The source @@ -3789,35 +3784,34 @@ class CXXDependentScopeMemberExpr final bool isArrow() const { return CXXDependentScopeMemberExprBits.IsArrow; } /// Retrieve the location of the '->' or '.' operator. - SourceLocation getOperatorLoc() const { return OperatorLoc; } - - /// Determines whether this member expression had a nested-name-specifier - /// prior to the name of the member, e.g., x->Base::foo. - bool hasQualifier() const { - return CXXDependentScopeMemberExprBits.HasQualifier; - } - - /// If the member name was qualified, retrieves the nested-name-specifier - /// that precedes the member name, with source-location information. - NestedNameSpecifierLoc getQualifierLoc() const { - if (!hasQualifier()) - return NestedNameSpecifierLoc(); - return *getTrailingObjects(); + SourceLocation getOperatorLoc() const { + return CXXDependentScopeMemberExprBits.OperatorLoc; } - /// If the member name was qualified, retrieves the - /// nested-name-specifier that precedes the member name. Otherwise, returns - /// NULL. + /// Retrieve the nested-name-specifier that qualifies the member name. NestedNameSpecifier *getQualifier() const { - return getQualifierLoc().getNestedNameSpecifier(); + return QualifierLoc.getNestedNameSpecifier(); } - /// Retrieve the declarations found by unqualified lookup for the first - /// component name of the nested-name-specifier, if any. - ArrayRef unqualified_lookups() const { - if (!getNumUnqualifiedLookups()) - return std::nullopt; - return {getTrailingObjects(), getNumUnqualifiedLookups()}; + /// Retrieve the nested-name-specifier that qualifies the member + /// name, with source location information. + NestedNameSpecifierLoc getQualifierLoc() const { return QualifierLoc; } + + /// Retrieve the first part of the nested-name-specifier that was + /// found in the scope of the member access expression when the member access + /// was initially parsed. + /// + /// This function only returns a useful result when member access expression + /// uses a qualified member name, e.g., "x.Base::f". Here, the declaration + /// returned by this function describes what was found by unqualified name + /// lookup for the identifier "Base" within the scope of the member access + /// expression itself. At template instantiation time, this information is + /// combined with the results of name lookup into the type of the object + /// expression itself (the class type of x). + NamedDecl *getFirstQualifierFoundInScope() const { + if (!hasFirstQualifierFoundInScope()) + return nullptr; + return *getTrailingObjects(); } /// Retrieve the name of the member that this expression refers to. diff --git a/clang/include/clang/AST/Stmt.h b/clang/include/clang/AST/Stmt.h index 257a61c97c9c6d..9cd7a364cd3f1d 100644 --- a/clang/include/clang/AST/Stmt.h +++ b/clang/include/clang/AST/Stmt.h @@ -1020,19 +1020,18 @@ class alignas(void *) Stmt { LLVM_PREFERRED_TYPE(bool) unsigned IsArrow : 1; - /// True if this member expression used a nested-name-specifier to - /// refer to the member, e.g., "x->Base::f". - LLVM_PREFERRED_TYPE(bool) - unsigned HasQualifier : 1; - /// Whether this member expression has info for explicit template /// keyword and arguments. LLVM_PREFERRED_TYPE(bool) unsigned HasTemplateKWAndArgsInfo : 1; - /// Number of declarations found by unqualified lookup for the - /// first component name of the nested-name-specifier. - unsigned NumUnqualifiedLookups; + /// See getFirstQualifierFoundInScope() and the comment listing + /// the trailing objects. + LLVM_PREFERRED_TYPE(bool) + unsigned HasFirstQualifierFoundInScope : 1; + + /// The location of the '->' or '.' operator. + SourceLocation OperatorLoc; }; class OverloadExprBitfields { diff --git a/clang/include/clang/AST/UnresolvedSet.h b/clang/include/clang/AST/UnresolvedSet.h index ef44499ce59264..1369725ab4e96a 100644 --- a/clang/include/clang/AST/UnresolvedSet.h +++ b/clang/include/clang/AST/UnresolvedSet.h @@ -97,10 +97,6 @@ class UnresolvedSetImpl { decls().push_back(DeclAccessPair::make(D, AS)); } - void addAllDecls(ArrayRef Other) { - append(iterator(Other.begin()), iterator(Other.end())); - } - /// Replaces the given declaration with the new one, once. /// /// \return true if the set changed diff --git a/clang/include/clang/Analysis/FlowSensitive/ASTOps.h b/clang/include/clang/Analysis/FlowSensitive/ASTOps.h index 925b99af9141a3..f9c923a36ad229 100644 --- a/clang/include/clang/Analysis/FlowSensitive/ASTOps.h +++ b/clang/include/clang/Analysis/FlowSensitive/ASTOps.h @@ -113,7 +113,11 @@ class AnalysisASTVisitor : public RecursiveASTVisitor { // nevertheless it appears in the Clang CFG, so we don't exclude it here. bool TraverseDecltypeTypeLoc(DecltypeTypeLoc) { return true; } bool TraverseTypeOfExprTypeLoc(TypeOfExprTypeLoc) { return true; } - bool TraverseCXXTypeidExpr(CXXTypeidExpr *) { return true; } + bool TraverseCXXTypeidExpr(CXXTypeidExpr *TIE) { + if (TIE->isPotentiallyEvaluated()) + return RecursiveASTVisitor::TraverseCXXTypeidExpr(TIE); + return true; + } bool TraverseUnaryExprOrTypeTraitExpr(UnaryExprOrTypeTraitExpr *) { return true; } diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def index 7074479786b973..a85e7918f4d7e0 100644 --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -37,36 +37,6 @@ TARGET_BUILTIN(__builtin_ia32_undef512, "V8d", "ncV:512:", "") TARGET_BUILTIN(__builtin_ia32_readeflags_u32, "Ui", "n", "") TARGET_BUILTIN(__builtin_ia32_writeeflags_u32, "vUi", "n", "") -// 3DNow! -// -TARGET_BUILTIN(__builtin_ia32_femms, "v", "n", "3dnow") -TARGET_BUILTIN(__builtin_ia32_pavgusb, "V8cV8cV8c", "ncV:64:", "3dnow") -TARGET_BUILTIN(__builtin_ia32_pf2id, "V2iV2f", "ncV:64:", "3dnow") -TARGET_BUILTIN(__builtin_ia32_pfacc, "V2fV2fV2f", "ncV:64:", "3dnow") -TARGET_BUILTIN(__builtin_ia32_pfadd, "V2fV2fV2f", "ncV:64:", "3dnow") -TARGET_BUILTIN(__builtin_ia32_pfcmpeq, "V2iV2fV2f", "ncV:64:", "3dnow") -TARGET_BUILTIN(__builtin_ia32_pfcmpge, "V2iV2fV2f", "ncV:64:", "3dnow") -TARGET_BUILTIN(__builtin_ia32_pfcmpgt, "V2iV2fV2f", "ncV:64:", "3dnow") -TARGET_BUILTIN(__builtin_ia32_pfmax, "V2fV2fV2f", "ncV:64:", "3dnow") -TARGET_BUILTIN(__builtin_ia32_pfmin, "V2fV2fV2f", "ncV:64:", "3dnow") -TARGET_BUILTIN(__builtin_ia32_pfmul, "V2fV2fV2f", "ncV:64:", "3dnow") -TARGET_BUILTIN(__builtin_ia32_pfrcp, "V2fV2f", "ncV:64:", "3dnow") -TARGET_BUILTIN(__builtin_ia32_pfrcpit1, "V2fV2fV2f", "ncV:64:", "3dnow") -TARGET_BUILTIN(__builtin_ia32_pfrcpit2, "V2fV2fV2f", "ncV:64:", "3dnow") -TARGET_BUILTIN(__builtin_ia32_pfrsqrt, "V2fV2f", "ncV:64:", "3dnow") -TARGET_BUILTIN(__builtin_ia32_pfrsqit1, "V2fV2fV2f", "ncV:64:", "3dnow") -TARGET_BUILTIN(__builtin_ia32_pfsub, "V2fV2fV2f", "ncV:64:", "3dnow") -TARGET_BUILTIN(__builtin_ia32_pfsubr, "V2fV2fV2f", "ncV:64:", "3dnow") -TARGET_BUILTIN(__builtin_ia32_pi2fd, "V2fV2i", "ncV:64:", "3dnow") -TARGET_BUILTIN(__builtin_ia32_pmulhrw, "V4sV4sV4s", "ncV:64:", "3dnow") -// 3DNow! Extensions (3dnowa). -TARGET_BUILTIN(__builtin_ia32_pf2iw, "V2iV2f", "ncV:64:", "3dnowa") -TARGET_BUILTIN(__builtin_ia32_pfnacc, "V2fV2fV2f", "ncV:64:", "3dnowa") -TARGET_BUILTIN(__builtin_ia32_pfpnacc, "V2fV2fV2f", "ncV:64:", "3dnowa") -TARGET_BUILTIN(__builtin_ia32_pi2fw, "V2fV2i", "ncV:64:", "3dnowa") -TARGET_BUILTIN(__builtin_ia32_pswapdsf, "V2fV2f", "ncV:64:", "3dnowa") -TARGET_BUILTIN(__builtin_ia32_pswapdsi, "V2iV2i", "ncV:64:", "3dnowa") - // MMX // // All MMX instructions will be generated via builtins. Any MMX vector diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index f1ddd2276e816a..f671b780bcbeb1 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -38,6 +38,7 @@ VALUE_CODEGENOPT(Name, Bits, Default) CODEGENOPT(DisableIntegratedAS, 1, 0) ///< -no-integrated-as CODEGENOPT(Crel, 1, 0) ///< -Wa,--crel CODEGENOPT(RelaxELFRelocations, 1, 1) ///< -Wa,-mrelax-relocations={yes,no} +CODEGENOPT(SSE2AVX , 1, 0) ///< -msse2avx CODEGENOPT(AsmVerbose , 1, 0) ///< -dA, -fverbose-asm. CODEGENOPT(PreserveAsmComments, 1, 1) ///< -dA, -fno-preserve-as-comments. CODEGENOPT(AssumeSaneOperatorNew , 1, 1) ///< implicit __attribute__((malloc)) operator new diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index e00cd47411cb31..12aab09f285567 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -895,9 +895,10 @@ def missing_template_arg_list_after_template_kw : Extension< "keyword">, InGroup>, DefaultError; -def ext_missing_dependent_template_keyword : ExtWarn< - "use 'template' keyword to treat '%0' as a dependent template name">, - InGroup>; +def err_missing_dependent_template_keyword : Error< + "use 'template' keyword to treat '%0' as a dependent template name">; +def warn_missing_dependent_template_keyword : ExtWarn< + "use 'template' keyword to treat '%0' as a dependent template name">; def ext_extern_template : Extension< "extern templates are a C++11 extension">, InGroup; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 0ea3677355169f..52ff4b026a60e2 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -7577,6 +7577,8 @@ def err_explicit_object_lambda_ambiguous_base : Error< def err_explicit_object_lambda_inaccessible_base : Error< "invalid explicit object parameter type %0 in lambda with capture; " "the type must derive publicly from the lambda">; +def err_explicit_object_parameter_invalid: Error< + "an explicit object parameter can only appear as the first parameter of a member function">; def err_ref_qualifier_overload : Error< "cannot overload a member function %select{without a ref-qualifier|with " diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index e6054425909098..7f4912b9bcd961 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -165,6 +165,9 @@ TOK(raw_identifier) // Used only in raw lexing mode. // C99 6.4.4.2: Floating Constants TOK(numeric_constant) // 0x123 +// Directly holds numerical value. Used to process C23 #embed. +TOK(binary_data) + // C99 6.4.4: Character Constants TOK(char_constant) // 'a' TOK(wide_char_constant) // L'b' diff --git a/clang/include/clang/Basic/TokenKinds.h b/clang/include/clang/Basic/TokenKinds.h index e5183a27d2bc5f..1b133dde895876 100644 --- a/clang/include/clang/Basic/TokenKinds.h +++ b/clang/include/clang/Basic/TokenKinds.h @@ -98,7 +98,7 @@ inline bool isLiteral(TokenKind K) { return K == tok::numeric_constant || K == tok::char_constant || K == tok::wide_char_constant || K == tok::utf8_char_constant || K == tok::utf16_char_constant || K == tok::utf32_char_constant || - isStringLiteral(K) || K == tok::header_name; + isStringLiteral(K) || K == tok::header_name || K == tok::binary_data; } /// Return true if this is any of tok::annot_* kinds. diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index f1e8cb87e5321a..2400b193d4d38c 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4854,6 +4854,14 @@ def mstrict_align : Flag<["-"], "mstrict-align">, Group, HelpText<"Force all memory accesses to be aligned (AArch64/LoongArch/RISC-V only)">; def mno_strict_align : Flag<["-"], "mno-strict-align">, Group, HelpText<"Allow memory accesses to be unaligned (AArch64/LoongArch/RISC-V only)">; +def mscalar_strict_align : Flag<["-"], "mscalar-strict-align">, Group, + HelpText<"Force all scalar memory accesses to be aligned (RISC-V only)">; +def mno_scalar_strict_align : Flag<["-"], "mno-scalar-strict-align">, Group, + HelpText<"Allow scalar memory accesses to be unaligned (RISC-V only)">; +def mvector_strict_align : Flag<["-"], "mvector-strict-align">, Group, + HelpText<"Force all vector memory accesses to be aligned (RISC-V only)">; +def mno_vector_strict_align : Flag<["-"], "mno-vector-strict-align">, Group, + HelpText<"Allow vector memory accesses to be unaligned (RISC-V only)">; def mno_thumb : Flag<["-"], "mno-thumb">, Group; def mrestrict_it: Flag<["-"], "mrestrict-it">, Group, HelpText<"Disallow generation of complex IT blocks. It is off by default.">; @@ -5171,6 +5179,13 @@ def mvx : Flag<["-"], "mvx">, Group; def mno_vx : Flag<["-"], "mno-vx">, Group; } // let Flags = [TargetSpecific] +let Flags = [TargetSpecific] in { +def msse2avx : Flag<["-"], "msse2avx">, Group, + Visibility<[ClangOption, CC1Option, CC1AsOption]>, + HelpText<"Specify that the assembler should encode SSE instructions with VEX prefix">, + MarshallingInfoFlag>; +} // let Flags = [TargetSpecific] + defm zvector : BoolFOption<"zvector", LangOpts<"ZVector">, DefaultFalse, PosFlag, Flags<[]>, HelpText<"Provide information about a particular module file">; def mthumb : Flag<["-"], "mthumb">, Group; def mtune_EQ : Joined<["-"], "mtune=">, Group, + Visibility<[ClangOption, FlangOption]>, HelpText<"Only supported on AArch64, PowerPC, RISC-V, SPARC, SystemZ, and X86">; def multi__module : Flag<["-"], "multi_module">; def multiply__defined__unused : Separate<["-"], "multiply_defined_unused">; @@ -6119,10 +6135,6 @@ def mno_80387 : Flag<["-"], "mno-80387">, Alias; def mno_fp_ret_in_387 : Flag<["-"], "mno-fp-ret-in-387">, Alias; def mmmx : Flag<["-"], "mmmx">, Group; def mno_mmx : Flag<["-"], "mno-mmx">, Group; -def m3dnow : Flag<["-"], "m3dnow">, Group; -def mno_3dnow : Flag<["-"], "mno-3dnow">, Group; -def m3dnowa : Flag<["-"], "m3dnowa">, Group; -def mno_3dnowa : Flag<["-"], "mno-3dnowa">, Group; def mamx_bf16 : Flag<["-"], "mamx-bf16">, Group; def mno_amx_bf16 : Flag<["-"], "mno-amx-bf16">, Group; def mamx_complex : Flag<["-"], "mamx-complex">, Group; @@ -6356,6 +6368,12 @@ def mvevpu : Flag<["-"], "mvevpu">, Group, def mno_vevpu : Flag<["-"], "mno-vevpu">, Group; } // let Flags = [TargetSpecific] +// Unsupported X86 feature flags (triggers a warning) +def m3dnow : Flag<["-"], "m3dnow">; +def mno_3dnow : Flag<["-"], "mno-3dnow">; +def m3dnowa : Flag<["-"], "m3dnowa">; +def mno_3dnowa : Flag<["-"], "mno-3dnowa">; + // These are legacy user-facing driver-level option spellings. They are always // aliases for options that are spelled using the more common Unix / GNU flag // style of double-dash and equals-joined flags. @@ -6768,9 +6786,6 @@ def emit_hlfir : Flag<["-"], "emit-hlfir">, Group, let Visibility = [CC1Option, CC1AsOption] in { -def tune_cpu : Separate<["-"], "tune-cpu">, - HelpText<"Tune for a specific cpu type">, - MarshallingInfoString>; def target_abi : Separate<["-"], "target-abi">, HelpText<"Target a particular ABI type">, MarshallingInfoString>; @@ -6797,6 +6812,9 @@ def darwin_target_variant_triple : Separate<["-"], "darwin-target-variant-triple let Visibility = [CC1Option, CC1AsOption, FC1Option] in { +def tune_cpu : Separate<["-"], "tune-cpu">, + HelpText<"Tune for a specific cpu type">, + MarshallingInfoString>; def target_cpu : Separate<["-"], "target-cpu">, HelpText<"Target a specific cpu type">, MarshallingInfoString>; diff --git a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h index 76d7fd798bed3a..1b27027621666a 100644 --- a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h +++ b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h @@ -175,22 +175,25 @@ class ExtractAPIVisitorBase : public RecursiveASTVisitor { // skip classes not inherited as public if (BaseSpecifier.getAccessSpecifier() != AccessSpecifier::AS_public) continue; - SymbolReference BaseClass; - if (BaseSpecifier.getType().getTypePtr()->isTemplateTypeParmType()) { - BaseClass.Name = API.copyString(BaseSpecifier.getType().getAsString()); - if (auto *TTPTD = BaseSpecifier.getType() - ->getAs() - ->getDecl()) { - SmallString<128> USR; - index::generateUSRForDecl(TTPTD, USR); - BaseClass.USR = API.copyString(USR); - BaseClass.Source = API.copyString(getOwningModuleName(*TTPTD)); - } + if (auto *BaseDecl = BaseSpecifier.getType()->getAsTagDecl()) { + Bases.emplace_back(createSymbolReferenceForDecl(*BaseDecl)); } else { - BaseClass = createSymbolReferenceForDecl( - *BaseSpecifier.getType().getTypePtr()->getAsCXXRecordDecl()); + SymbolReference BaseClass; + BaseClass.Name = API.copyString(BaseSpecifier.getType().getAsString( + Decl->getASTContext().getPrintingPolicy())); + + if (BaseSpecifier.getType().getTypePtr()->isTemplateTypeParmType()) { + if (auto *TTPTD = BaseSpecifier.getType() + ->getAs() + ->getDecl()) { + SmallString<128> USR; + index::generateUSRForDecl(TTPTD, USR); + BaseClass.USR = API.copyString(USR); + BaseClass.Source = API.copyString(getOwningModuleName(*TTPTD)); + } + } + Bases.emplace_back(BaseClass); } - Bases.emplace_back(BaseClass); } return Bases; } @@ -352,7 +355,7 @@ bool ExtractAPIVisitorBase::VisitFunctionDecl( return true; // Collect symbol information. - StringRef Name = Decl->getName(); + auto Name = Decl->getNameAsString(); SmallString<128> USR; index::generateUSRForDecl(Decl, USR); PresumedLoc Loc = @@ -666,8 +669,8 @@ bool ExtractAPIVisitorBase::VisitCXXMethodDecl( if (FunctionTemplateDecl *TemplateDecl = Decl->getDescribedFunctionTemplate()) { API.createRecord( - USR, Decl->getName(), createHierarchyInformationForDecl(*Decl), Loc, - AvailabilityInfo::createFromDecl(Decl), Comment, + USR, Decl->getNameAsString(), createHierarchyInformationForDecl(*Decl), + Loc, AvailabilityInfo::createFromDecl(Decl), Comment, DeclarationFragmentsBuilder::getFragmentsForFunctionTemplate( TemplateDecl), SubHeading, DeclarationFragmentsBuilder::getFunctionSignature(Decl), @@ -675,8 +678,8 @@ bool ExtractAPIVisitorBase::VisitCXXMethodDecl( Template(TemplateDecl), isInSystemHeader(Decl)); } else if (Decl->getTemplateSpecializationInfo()) API.createRecord( - USR, Decl->getName(), createHierarchyInformationForDecl(*Decl), Loc, - AvailabilityInfo::createFromDecl(Decl), Comment, + USR, Decl->getNameAsString(), createHierarchyInformationForDecl(*Decl), + Loc, AvailabilityInfo::createFromDecl(Decl), Comment, DeclarationFragmentsBuilder:: getFragmentsForFunctionTemplateSpecialization(Decl), SubHeading, Signature, Access, isInSystemHeader(Decl)); @@ -688,14 +691,14 @@ bool ExtractAPIVisitorBase::VisitCXXMethodDecl( SubHeading, Signature, Access, isInSystemHeader(Decl)); else if (Decl->isStatic()) API.createRecord( - USR, Decl->getName(), createHierarchyInformationForDecl(*Decl), Loc, - AvailabilityInfo::createFromDecl(Decl), Comment, + USR, Decl->getNameAsString(), createHierarchyInformationForDecl(*Decl), + Loc, AvailabilityInfo::createFromDecl(Decl), Comment, DeclarationFragmentsBuilder::getFragmentsForCXXMethod(Decl), SubHeading, Signature, Access, isInSystemHeader(Decl)); else API.createRecord( - USR, Decl->getName(), createHierarchyInformationForDecl(*Decl), Loc, - AvailabilityInfo::createFromDecl(Decl), Comment, + USR, Decl->getNameAsString(), createHierarchyInformationForDecl(*Decl), + Loc, AvailabilityInfo::createFromDecl(Decl), Comment, DeclarationFragmentsBuilder::getFragmentsForCXXMethod(Decl), SubHeading, Signature, Access, isInSystemHeader(Decl)); @@ -977,7 +980,7 @@ bool ExtractAPIVisitorBase::VisitFunctionTemplateDecl( return true; // Collect symbol information. - StringRef Name = Decl->getName(); + auto Name = Decl->getNameAsString(); SmallString<128> USR; index::generateUSRForDecl(Decl, USR); PresumedLoc Loc = diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index be3334b9807463..fc7d0053f2323b 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -2123,17 +2123,18 @@ class Preprocessor { char getSpellingOfSingleCharacterNumericConstant(const Token &Tok, bool *Invalid = nullptr) const { - assert(Tok.is(tok::numeric_constant) && + assert((Tok.is(tok::numeric_constant) || Tok.is(tok::binary_data)) && Tok.getLength() == 1 && "Called on unsupported token"); assert(!Tok.needsCleaning() && "Token can't need cleaning with length 1"); // If the token is carrying a literal data pointer, just use it. if (const char *D = Tok.getLiteralData()) - return *D; + return (Tok.getKind() == tok::binary_data) ? *D : *D - '0'; + assert(Tok.is(tok::numeric_constant) && "binary data with no data"); // Otherwise, fall back on getCharacterData, which is slower, but always // works. - return *SourceMgr.getCharacterData(Tok.getLocation(), Invalid); + return *SourceMgr.getCharacterData(Tok.getLocation(), Invalid) - '0'; } /// Retrieve the name of the immediate macro expansion. diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index bb3f08aef0378b..93e60be512aae0 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -2127,7 +2127,7 @@ class Parser : public CodeCompletionHandler { }; ExprResult ParseInitializerWithPotentialDesignator(DesignatorCompletionInfo); ExprResult createEmbedExpr(); - void ExpandEmbedDirective(SmallVectorImpl &Exprs); + void injectEmbedTokens(); //===--------------------------------------------------------------------===// // clang Expressions @@ -3372,11 +3372,15 @@ class Parser : public CodeCompletionHandler { BaseResult ParseBaseSpecifier(Decl *ClassDecl); AccessSpecifier getAccessSpecifierIfPresent() const; - bool ParseUnqualifiedIdTemplateId( - CXXScopeSpec &SS, ParsedType ObjectType, bool ObjectHadErrors, - SourceLocation TemplateKWLoc, SourceLocation TildeLoc, - IdentifierInfo *Name, SourceLocation NameLoc, bool EnteringContext, - UnqualifiedId &Id, bool AssumeTemplateId); + bool ParseUnqualifiedIdTemplateId(CXXScopeSpec &SS, + ParsedType ObjectType, + bool ObjectHadErrors, + SourceLocation TemplateKWLoc, + IdentifierInfo *Name, + SourceLocation NameLoc, + bool EnteringContext, + UnqualifiedId &Id, + bool AssumeTemplateId); bool ParseUnqualifiedIdOperator(CXXScopeSpec &SS, bool EnteringContext, ParsedType ObjectType, UnqualifiedId &Result); @@ -3830,7 +3834,6 @@ class Parser : public CodeCompletionHandler { AnnotateTemplateIdTokenAsType(CXXScopeSpec &SS, ImplicitTypenameContext AllowImplicitTypename, bool IsClassName = false); - void ExpandEmbedIntoTemplateArgList(TemplateArgList &TemplateArgs); bool ParseTemplateArgumentList(TemplateArgList &TemplateArgs, TemplateTy Template, SourceLocation OpenLoc); ParsedTemplateArgument ParseTemplateTemplateArgument(); diff --git a/clang/include/clang/Sema/DeclSpec.h b/clang/include/clang/Sema/DeclSpec.h index 9c22c35535ede7..425b6e2a0b30c9 100644 --- a/clang/include/clang/Sema/DeclSpec.h +++ b/clang/include/clang/Sema/DeclSpec.h @@ -75,7 +75,6 @@ class CXXScopeSpec { SourceRange Range; NestedNameSpecifierLocBuilder Builder; ArrayRef TemplateParamLists; - ArrayRef UnqualifiedLookups; public: SourceRange getRange() const { return Range; } @@ -92,13 +91,6 @@ class CXXScopeSpec { return TemplateParamLists; } - void setUnqualifiedLookups(ArrayRef Found) { - UnqualifiedLookups = Found; - } - ArrayRef getUnqualifiedLookups() const { - return UnqualifiedLookups; - } - /// Retrieve the representation of the nested-name-specifier. NestedNameSpecifier *getScopeRep() const { return Builder.getRepresentation(); diff --git a/clang/include/clang/Sema/HLSLExternalSemaSource.h b/clang/include/clang/Sema/HLSLExternalSemaSource.h index c0bfff327139f8..3c7495e66055dc 100644 --- a/clang/include/clang/Sema/HLSLExternalSemaSource.h +++ b/clang/include/clang/Sema/HLSLExternalSemaSource.h @@ -23,7 +23,6 @@ class Sema; class HLSLExternalSemaSource : public ExternalSemaSource { Sema *SemaPtr = nullptr; NamespaceDecl *HLSLNamespace = nullptr; - CXXRecordDecl *ResourceDecl = nullptr; using CompletionFunction = std::function; llvm::DenseMap Completions; diff --git a/clang/include/clang/Sema/Lookup.h b/clang/include/clang/Sema/Lookup.h index 6b765ef3c980f6..b0a08a05ac6a0a 100644 --- a/clang/include/clang/Sema/Lookup.h +++ b/clang/include/clang/Sema/Lookup.h @@ -483,15 +483,11 @@ class LookupResult { ResultKind = Found; } - void addAllDecls(ArrayRef Other) { - Decls.addAllDecls(Other); - ResultKind = Found; - } - /// Add all the declarations from another set of lookup /// results. void addAllDecls(const LookupResult &Other) { - addAllDecls(Other.Decls.pairs()); + Decls.append(Other.Decls.begin(), Other.Decls.end()); + ResultKind = Found; } /// Determine whether no result was found because we could not diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 6be6f6725e5b75..48dff1b76cc57f 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -560,13 +560,14 @@ class Sema final : public SemaBase { // 23. Statement Attribute Handling (SemaStmtAttr.cpp) // 24. C++ Templates (SemaTemplate.cpp) // 25. C++ Template Argument Deduction (SemaTemplateDeduction.cpp) - // 26. C++ Template Instantiation (SemaTemplateInstantiate.cpp) - // 27. C++ Template Declaration Instantiation + // 26. C++ Template Deduction Guide (SemaTemplateDeductionGuide.cpp) + // 27. C++ Template Instantiation (SemaTemplateInstantiate.cpp) + // 28. C++ Template Declaration Instantiation // (SemaTemplateInstantiateDecl.cpp) - // 28. C++ Variadic Templates (SemaTemplateVariadic.cpp) - // 29. Constraints and Concepts (SemaConcept.cpp) - // 30. Types (SemaType.cpp) - // 31. FixIt Helpers (SemaFixItUtils.cpp) + // 29. C++ Variadic Templates (SemaTemplateVariadic.cpp) + // 30. Constraints and Concepts (SemaConcept.cpp) + // 31. Types (SemaType.cpp) + // 32. FixIt Helpers (SemaFixItUtils.cpp) /// \name Semantic Analysis /// Implementations are in Sema.cpp @@ -2802,8 +2803,7 @@ class Sema final : public SemaBase { /// (e.g., Base::), perform name lookup for that identifier as a /// nested-name-specifier within the given scope, and return the result of /// that name lookup. - bool LookupFirstQualifierInScope(Scope *S, NestedNameSpecifier *NNS, - UnresolvedSetImpl &R); + NamedDecl *FindFirstQualifierInScope(Scope *S, NestedNameSpecifier *NNS); /// Keeps information about an identifier in a nested-name-spec. /// @@ -2843,6 +2843,9 @@ class Sema final : public SemaBase { /// \param EnteringContext If true, enter the context specified by the /// nested-name-specifier. /// \param SS Optional nested name specifier preceding the identifier. + /// \param ScopeLookupResult Provides the result of name lookup within the + /// scope of the nested-name-specifier that was computed at template + /// definition time. /// \param ErrorRecoveryLookup Specifies if the method is called to improve /// error recovery and what kind of recovery is performed. /// \param IsCorrectedToColon If not null, suggestion of replace '::' -> ':' @@ -2851,6 +2854,11 @@ class Sema final : public SemaBase { /// not '::'. /// \param OnlyNamespace If true, only considers namespaces in lookup. /// + /// This routine differs only slightly from ActOnCXXNestedNameSpecifier, in + /// that it contains an extra parameter \p ScopeLookupResult, which provides + /// the result of name lookup within the scope of the nested-name-specifier + /// that was computed at template definition time. + /// /// If ErrorRecoveryLookup is true, then this call is used to improve error /// recovery. This means that it should not emit diagnostics, it should /// just return true on failure. It also means it should only return a valid @@ -2859,6 +2867,7 @@ class Sema final : public SemaBase { /// specifier. bool BuildCXXNestedNameSpecifier(Scope *S, NestedNameSpecInfo &IdInfo, bool EnteringContext, CXXScopeSpec &SS, + NamedDecl *ScopeLookupResult, bool ErrorRecoveryLookup, bool *IsCorrectedToColon = nullptr, bool OnlyNamespace = false); @@ -8558,12 +8567,11 @@ class Sema final : public SemaBase { const TemplateArgumentListInfo *TemplateArgs, bool IsDefiniteInstance, const Scope *S); - ExprResult - ActOnDependentMemberExpr(Expr *Base, QualType BaseType, bool IsArrow, - SourceLocation OpLoc, const CXXScopeSpec &SS, - SourceLocation TemplateKWLoc, - const DeclarationNameInfo &NameInfo, - const TemplateArgumentListInfo *TemplateArgs); + ExprResult ActOnDependentMemberExpr( + Expr *Base, QualType BaseType, bool IsArrow, SourceLocation OpLoc, + const CXXScopeSpec &SS, SourceLocation TemplateKWLoc, + NamedDecl *FirstQualifierInScope, const DeclarationNameInfo &NameInfo, + const TemplateArgumentListInfo *TemplateArgs); /// The main callback when the parser finds something like /// expression . [nested-name-specifier] identifier @@ -8619,14 +8627,15 @@ class Sema final : public SemaBase { ExprResult BuildMemberReferenceExpr( Expr *Base, QualType BaseType, SourceLocation OpLoc, bool IsArrow, CXXScopeSpec &SS, SourceLocation TemplateKWLoc, - const DeclarationNameInfo &NameInfo, + NamedDecl *FirstQualifierInScope, const DeclarationNameInfo &NameInfo, const TemplateArgumentListInfo *TemplateArgs, const Scope *S, ActOnMemberAccessExtraArgs *ExtraArgs = nullptr); ExprResult BuildMemberReferenceExpr(Expr *Base, QualType BaseType, SourceLocation OpLoc, bool IsArrow, const CXXScopeSpec &SS, - SourceLocation TemplateKWLoc, LookupResult &R, + SourceLocation TemplateKWLoc, + NamedDecl *FirstQualifierInScope, LookupResult &R, const TemplateArgumentListInfo *TemplateArgs, const Scope *S, bool SuppressQualifierCheck = false, ActOnMemberAccessExtraArgs *ExtraArgs = nullptr); @@ -11114,14 +11123,15 @@ class Sema final : public SemaBase { QualType ObjectType, bool EnteringContext, RequiredTemplateKind RequiredTemplate = SourceLocation(), AssumedTemplateKind *ATK = nullptr, - bool AllowTypoCorrection = true, bool MayBeNNS = false); + bool AllowTypoCorrection = true); - TemplateNameKind - isTemplateName(Scope *S, CXXScopeSpec &SS, bool hasTemplateKeyword, - const UnqualifiedId &Name, ParsedType ObjectType, - bool EnteringContext, TemplateTy &Template, - bool &MemberOfUnknownSpecialization, - bool Disambiguation = false, bool MayBeNNS = false); + TemplateNameKind isTemplateName(Scope *S, CXXScopeSpec &SS, + bool hasTemplateKeyword, + const UnqualifiedId &Name, + ParsedType ObjectType, bool EnteringContext, + TemplateTy &Template, + bool &MemberOfUnknownSpecialization, + bool Disambiguation = false); /// Try to resolve an undeclared template name as a type template. /// @@ -11347,6 +11357,10 @@ class Sema final : public SemaBase { bool &IsMemberSpecialization, bool &Invalid, bool SuppressDiagnostic = false); + /// Returns the template parameter list with all default template argument + /// information. + TemplateParameterList *GetTemplateParameterList(TemplateDecl *TD); + DeclResult CheckClassTemplate( Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc, CXXScopeSpec &SS, IdentifierInfo *Name, SourceLocation NameLoc, @@ -11450,11 +11464,12 @@ class Sema final : public SemaBase { /// For example, given "x.MetaFun::template apply", the scope specifier /// \p SS will be "MetaFun::", \p TemplateKWLoc contains the location /// of the "template" keyword, and "apply" is the \p Name. - TemplateNameKind - ActOnTemplateName(Scope *S, CXXScopeSpec &SS, SourceLocation TemplateKWLoc, - const UnqualifiedId &Name, ParsedType ObjectType, - bool EnteringContext, TemplateTy &Template, - bool AllowInjectedClassName = false, bool MayBeNNS = false); + TemplateNameKind ActOnTemplateName(Scope *S, CXXScopeSpec &SS, + SourceLocation TemplateKWLoc, + const UnqualifiedId &Name, + ParsedType ObjectType, + bool EnteringContext, TemplateTy &Template, + bool AllowInjectedClassName = false); DeclResult ActOnClassTemplateSpecialization( Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc, @@ -12009,15 +12024,6 @@ class Sema final : public SemaBase { unsigned TemplateDepth, const Expr *Constraint); - /// Declare implicit deduction guides for a class template if we've - /// not already done so. - void DeclareImplicitDeductionGuides(TemplateDecl *Template, - SourceLocation Loc); - - FunctionTemplateDecl *DeclareAggregateDeductionGuideFromInitList( - TemplateDecl *Template, MutableArrayRef ParamTypes, - SourceLocation Loc); - /// Find the failed Boolean condition within a given Boolean /// constant expression, and describe it with a string. std::pair findFailedBooleanCondition(Expr *Cond); @@ -12570,6 +12576,27 @@ class Sema final : public SemaBase { // // + /// \name C++ Template Deduction Guide + /// Implementations are in SemaTemplateDeductionGuide.cpp + ///@{ + + /// Declare implicit deduction guides for a class template if we've + /// not already done so. + void DeclareImplicitDeductionGuides(TemplateDecl *Template, + SourceLocation Loc); + + FunctionTemplateDecl *DeclareAggregateDeductionGuideFromInitList( + TemplateDecl *Template, MutableArrayRef ParamTypes, + SourceLocation Loc); + + ///@} + + // + // + // ------------------------------------------------------------------------- + // + // + /// \name C++ Template Instantiation /// Implementations are in SemaTemplateInstantiate.cpp ///@{ diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 497579dcc56b6e..6c89e3890ae3e8 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -7250,14 +7250,14 @@ ASTContext::getCanonicalNestedNameSpecifier(NestedNameSpecifier *NNS) const { // A namespace is canonical; build a nested-name-specifier with // this namespace and no prefix. return NestedNameSpecifier::Create(*this, nullptr, - NNS->getAsNamespace()->getOriginalNamespace()); + NNS->getAsNamespace()->getFirstDecl()); case NestedNameSpecifier::NamespaceAlias: // A namespace is canonical; build a nested-name-specifier with // this namespace and no prefix. - return NestedNameSpecifier::Create(*this, nullptr, - NNS->getAsNamespaceAlias()->getNamespace() - ->getOriginalNamespace()); + return NestedNameSpecifier::Create( + *this, nullptr, + NNS->getAsNamespaceAlias()->getNamespace()->getFirstDecl()); // The difference between TypeSpec and TypeSpecWithTemplate is that the // latter will have the 'template' keyword when printed. diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 9bb035c07b8ae1..4e1b3a5a94de76 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -8439,14 +8439,8 @@ ExpectedStmt ASTNodeImporter::VisitCXXDependentScopeMemberExpr( auto ToOperatorLoc = importChecked(Err, E->getOperatorLoc()); auto ToQualifierLoc = importChecked(Err, E->getQualifierLoc()); auto ToTemplateKeywordLoc = importChecked(Err, E->getTemplateKeywordLoc()); - - UnresolvedSet<8> ToUnqualifiedLookups; - for (auto D : E->unqualified_lookups()) - if (auto ToDOrErr = import(D.getDecl())) - ToUnqualifiedLookups.addDecl(*ToDOrErr); - else - return ToDOrErr.takeError(); - + auto ToFirstQualifierFoundInScope = + importChecked(Err, E->getFirstQualifierFoundInScope()); if (Err) return std::move(Err); @@ -8480,7 +8474,7 @@ ExpectedStmt ASTNodeImporter::VisitCXXDependentScopeMemberExpr( return CXXDependentScopeMemberExpr::Create( Importer.getToContext(), ToBase, ToType, E->isArrow(), ToOperatorLoc, - ToQualifierLoc, ToTemplateKeywordLoc, ToUnqualifiedLookups.pairs(), + ToQualifierLoc, ToTemplateKeywordLoc, ToFirstQualifierFoundInScope, ToMemberNameInfo, ResInfo); } diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp index ef2c57e6204dc8..bc5a9206c0db28 100644 --- a/clang/lib/AST/DeclBase.cpp +++ b/clang/lib/AST/DeclBase.cpp @@ -1422,8 +1422,7 @@ DeclContext *DeclContext::getPrimaryContext() { case Decl::TranslationUnit: return static_cast(this)->getFirstDecl(); case Decl::Namespace: - // The original namespace is our primary context. - return static_cast(this)->getOriginalNamespace(); + return static_cast(this)->getFirstDecl(); case Decl::ObjCMethod: return this; diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index d5c140fd343895..72d68f39a97a53 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -2941,7 +2941,7 @@ UsingDirectiveDecl *UsingDirectiveDecl::Create(ASTContext &C, DeclContext *DC, NamedDecl *Used, DeclContext *CommonAncestor) { if (auto *NS = dyn_cast_or_null(Used)) - Used = NS->getOriginalNamespace(); + Used = NS->getFirstDecl(); return new (C, DC) UsingDirectiveDecl(DC, L, NamespaceLoc, QualifierLoc, IdentLoc, Used, CommonAncestor); } @@ -2966,16 +2966,9 @@ NamespaceDecl::NamespaceDecl(ASTContext &C, DeclContext *DC, bool Inline, bool Nested) : NamedDecl(Namespace, DC, IdLoc, Id), DeclContext(Namespace), redeclarable_base(C), LocStart(StartLoc) { - unsigned Flags = 0; - if (Inline) - Flags |= F_Inline; - if (Nested) - Flags |= F_Nested; - AnonOrFirstNamespaceAndFlags = {nullptr, Flags}; + setInline(Inline); + setNested(Nested); setPreviousDecl(PrevDecl); - - if (PrevDecl) - AnonOrFirstNamespaceAndFlags.setPointer(PrevDecl->getOriginalNamespace()); } NamespaceDecl *NamespaceDecl::Create(ASTContext &C, DeclContext *DC, @@ -2992,22 +2985,6 @@ NamespaceDecl *NamespaceDecl::CreateDeserialized(ASTContext &C, SourceLocation(), nullptr, nullptr, false); } -NamespaceDecl *NamespaceDecl::getOriginalNamespace() { - if (isFirstDecl()) - return this; - - return AnonOrFirstNamespaceAndFlags.getPointer(); -} - -const NamespaceDecl *NamespaceDecl::getOriginalNamespace() const { - if (isFirstDecl()) - return this; - - return AnonOrFirstNamespaceAndFlags.getPointer(); -} - -bool NamespaceDecl::isOriginalNamespace() const { return isFirstDecl(); } - NamespaceDecl *NamespaceDecl::getNextRedeclarationImpl() { return getNextRedeclaration(); } @@ -3043,7 +3020,7 @@ NamespaceAliasDecl *NamespaceAliasDecl::Create(ASTContext &C, DeclContext *DC, NamedDecl *Namespace) { // FIXME: Preserve the aliased namespace as written. if (auto *NS = dyn_cast_or_null(Namespace)) - Namespace = NS->getOriginalNamespace(); + Namespace = NS->getFirstDecl(); return new (C, DC) NamespaceAliasDecl(C, DC, UsingLoc, AliasLoc, Alias, QualifierLoc, IdentLoc, Namespace); } diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp index 9d2883a8debb72..8d2a1b5611ccc6 100644 --- a/clang/lib/AST/ExprCXX.cpp +++ b/clang/lib/AST/ExprCXX.cpp @@ -1489,27 +1489,19 @@ SourceLocation CXXUnresolvedConstructExpr::getBeginLoc() const { CXXDependentScopeMemberExpr::CXXDependentScopeMemberExpr( const ASTContext &Ctx, Expr *Base, QualType BaseType, bool IsArrow, SourceLocation OperatorLoc, NestedNameSpecifierLoc QualifierLoc, - SourceLocation TemplateKWLoc, ArrayRef UnqualifiedLookups, + SourceLocation TemplateKWLoc, NamedDecl *FirstQualifierFoundInScope, DeclarationNameInfo MemberNameInfo, const TemplateArgumentListInfo *TemplateArgs) : Expr(CXXDependentScopeMemberExprClass, Ctx.DependentTy, VK_LValue, OK_Ordinary), - Base(Base), BaseType(BaseType), MemberNameInfo(MemberNameInfo), - OperatorLoc(OperatorLoc) { + Base(Base), BaseType(BaseType), QualifierLoc(QualifierLoc), + MemberNameInfo(MemberNameInfo) { CXXDependentScopeMemberExprBits.IsArrow = IsArrow; - CXXDependentScopeMemberExprBits.HasQualifier = QualifierLoc.hasQualifier(); - CXXDependentScopeMemberExprBits.NumUnqualifiedLookups = - UnqualifiedLookups.size(); CXXDependentScopeMemberExprBits.HasTemplateKWAndArgsInfo = (TemplateArgs != nullptr) || TemplateKWLoc.isValid(); - - if (hasQualifier()) - new (getTrailingObjects()) - NestedNameSpecifierLoc(QualifierLoc); - - std::uninitialized_copy_n(UnqualifiedLookups.data(), - UnqualifiedLookups.size(), - getTrailingObjects()); + CXXDependentScopeMemberExprBits.HasFirstQualifierFoundInScope = + FirstQualifierFoundInScope != nullptr; + CXXDependentScopeMemberExprBits.OperatorLoc = OperatorLoc; if (TemplateArgs) { auto Deps = TemplateArgumentDependence::None; @@ -1521,59 +1513,54 @@ CXXDependentScopeMemberExpr::CXXDependentScopeMemberExpr( TemplateKWLoc); } + if (hasFirstQualifierFoundInScope()) + *getTrailingObjects() = FirstQualifierFoundInScope; setDependence(computeDependence(this)); } CXXDependentScopeMemberExpr::CXXDependentScopeMemberExpr( - EmptyShell Empty, bool HasQualifier, unsigned NumUnqualifiedLookups, - bool HasTemplateKWAndArgsInfo) + EmptyShell Empty, bool HasTemplateKWAndArgsInfo, + bool HasFirstQualifierFoundInScope) : Expr(CXXDependentScopeMemberExprClass, Empty) { - CXXDependentScopeMemberExprBits.HasQualifier = HasQualifier; - CXXDependentScopeMemberExprBits.NumUnqualifiedLookups = NumUnqualifiedLookups; CXXDependentScopeMemberExprBits.HasTemplateKWAndArgsInfo = HasTemplateKWAndArgsInfo; + CXXDependentScopeMemberExprBits.HasFirstQualifierFoundInScope = + HasFirstQualifierFoundInScope; } CXXDependentScopeMemberExpr *CXXDependentScopeMemberExpr::Create( const ASTContext &Ctx, Expr *Base, QualType BaseType, bool IsArrow, SourceLocation OperatorLoc, NestedNameSpecifierLoc QualifierLoc, - SourceLocation TemplateKWLoc, ArrayRef UnqualifiedLookups, + SourceLocation TemplateKWLoc, NamedDecl *FirstQualifierFoundInScope, DeclarationNameInfo MemberNameInfo, const TemplateArgumentListInfo *TemplateArgs) { - bool HasQualifier = QualifierLoc.hasQualifier(); - unsigned NumUnqualifiedLookups = UnqualifiedLookups.size(); - assert(!NumUnqualifiedLookups || HasQualifier); bool HasTemplateKWAndArgsInfo = (TemplateArgs != nullptr) || TemplateKWLoc.isValid(); unsigned NumTemplateArgs = TemplateArgs ? TemplateArgs->size() : 0; - unsigned Size = - totalSizeToAlloc( - HasQualifier, NumUnqualifiedLookups, HasTemplateKWAndArgsInfo, - NumTemplateArgs); + bool HasFirstQualifierFoundInScope = FirstQualifierFoundInScope != nullptr; + + unsigned Size = totalSizeToAlloc( + HasTemplateKWAndArgsInfo, NumTemplateArgs, HasFirstQualifierFoundInScope); void *Mem = Ctx.Allocate(Size, alignof(CXXDependentScopeMemberExpr)); return new (Mem) CXXDependentScopeMemberExpr( Ctx, Base, BaseType, IsArrow, OperatorLoc, QualifierLoc, TemplateKWLoc, - UnqualifiedLookups, MemberNameInfo, TemplateArgs); + FirstQualifierFoundInScope, MemberNameInfo, TemplateArgs); } CXXDependentScopeMemberExpr *CXXDependentScopeMemberExpr::CreateEmpty( - const ASTContext &Ctx, bool HasQualifier, unsigned NumUnqualifiedLookups, - bool HasTemplateKWAndArgsInfo, unsigned NumTemplateArgs) { - assert(!NumTemplateArgs || HasTemplateKWAndArgsInfo); - assert(!NumUnqualifiedLookups || HasQualifier); + const ASTContext &Ctx, bool HasTemplateKWAndArgsInfo, + unsigned NumTemplateArgs, bool HasFirstQualifierFoundInScope) { + assert(NumTemplateArgs == 0 || HasTemplateKWAndArgsInfo); - unsigned Size = - totalSizeToAlloc( - HasQualifier, NumUnqualifiedLookups, HasTemplateKWAndArgsInfo, - NumTemplateArgs); + unsigned Size = totalSizeToAlloc( + HasTemplateKWAndArgsInfo, NumTemplateArgs, HasFirstQualifierFoundInScope); void *Mem = Ctx.Allocate(Size, alignof(CXXDependentScopeMemberExpr)); - return new (Mem) CXXDependentScopeMemberExpr(EmptyShell(), HasQualifier, - NumUnqualifiedLookups, - HasTemplateKWAndArgsInfo); + return new (Mem) CXXDependentScopeMemberExpr( + EmptyShell(), HasTemplateKWAndArgsInfo, HasFirstQualifierFoundInScope); } CXXThisExpr *CXXThisExpr::Create(const ASTContext &Ctx, SourceLocation L, diff --git a/clang/lib/AST/Interp/Boolean.h b/clang/lib/AST/Interp/Boolean.h index 336f7941dfc479..1bfb26b1b669f9 100644 --- a/clang/lib/AST/Interp/Boolean.h +++ b/clang/lib/AST/Interp/Boolean.h @@ -45,15 +45,10 @@ class Boolean final { Boolean operator-(const Boolean &Other) const { return Boolean(V - Other.V); } Boolean operator~() const { return Boolean(true); } - explicit operator int8_t() const { return V; } - explicit operator uint8_t() const { return V; } - explicit operator int16_t() const { return V; } - explicit operator uint16_t() const { return V; } - explicit operator int32_t() const { return V; } - explicit operator uint32_t() const { return V; } - explicit operator int64_t() const { return V; } - explicit operator uint64_t() const { return V; } - explicit operator bool() const { return V; } + template >> + explicit operator Ty() const { + return V; + } APSInt toAPSInt() const { return APSInt(APInt(1, static_cast(V), false), true); diff --git a/clang/lib/AST/Interp/ByteCodeEmitter.cpp b/clang/lib/AST/Interp/ByteCodeEmitter.cpp index ae777d555e9165..17da77bc63c9bb 100644 --- a/clang/lib/AST/Interp/ByteCodeEmitter.cpp +++ b/clang/lib/AST/Interp/ByteCodeEmitter.cpp @@ -93,6 +93,11 @@ Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) { // Set up lambda capture to closure record field mapping. if (isLambdaCallOperator(MD)) { + // The parent record needs to be complete, we need to know about all + // the lambda captures. + if (!MD->getParent()->isCompleteDefinition()) + return nullptr; + const Record *R = P.getOrCreateRecord(MD->getParent()); llvm::DenseMap LC; FieldDecl *LTC; diff --git a/clang/lib/AST/Interp/Compiler.cpp b/clang/lib/AST/Interp/Compiler.cpp index 48e7519f8f89d7..30dc7f5e4840be 100644 --- a/clang/lib/AST/Interp/Compiler.cpp +++ b/clang/lib/AST/Interp/Compiler.cpp @@ -3073,13 +3073,13 @@ bool Compiler::VisitStmtExpr(const StmtExpr *E) { } assert(S == Result); - // This better produces a value (i.e. is an expression). if (const Expr *ResultExpr = dyn_cast(S)) { if (DiscardResult) return this->discard(ResultExpr); return this->delegate(ResultExpr); } - return false; + + return this->visitStmt(S); } return BS.destroyLocals(); @@ -3583,7 +3583,19 @@ VarCreationState Compiler::visitVarDecl(const VarDecl *VD, bool Topleve return checkDecl() && this->emitInitGlobal(*VarT, GlobalIndex, VD); } - return checkDecl() && this->visitGlobalInitializer(Init, GlobalIndex); + if (!checkDecl()) + return false; + + if (!this->emitGetPtrGlobal(GlobalIndex, Init)) + return false; + + if (!visitInitializer(Init)) + return false; + + if (!this->emitFinishInit(Init)) + return false; + + return this->emitPopPtr(Init); }; // We've already seen and initialized this global. @@ -3627,7 +3639,16 @@ VarCreationState Compiler::visitVarDecl(const VarDecl *VD, bool Topleve if (!Init) return true; - return this->visitLocalInitializer(Init, *Offset); + if (!this->emitGetPtrLocal(*Offset, Init)) + return false; + + if (!visitInitializer(Init)) + return false; + + if (!this->emitFinishInit(Init)) + return false; + + return this->emitPopPtr(Init); } return false; } @@ -4685,6 +4706,8 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { case UO_PostInc: { // x++ if (!Ctx.getLangOpts().CPlusPlus14) return this->emitInvalid(E); + if (!T) + return this->emitError(E); if (!this->visit(SubExpr)) return false; @@ -4706,6 +4729,8 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { case UO_PostDec: { // x-- if (!Ctx.getLangOpts().CPlusPlus14) return this->emitInvalid(E); + if (!T) + return this->emitError(E); if (!this->visit(SubExpr)) return false; @@ -4727,6 +4752,8 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { case UO_PreInc: { // ++x if (!Ctx.getLangOpts().CPlusPlus14) return this->emitInvalid(E); + if (!T) + return this->emitError(E); if (!this->visit(SubExpr)) return false; @@ -4774,6 +4801,8 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { case UO_PreDec: { // --x if (!Ctx.getLangOpts().CPlusPlus14) return this->emitInvalid(E); + if (!T) + return this->emitError(E); if (!this->visit(SubExpr)) return false; @@ -4819,6 +4848,9 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { return E->isGLValue() || this->emitLoadPop(*T, E); } case UO_LNot: // !x + if (!T) + return this->emitError(E); + if (DiscardResult) return this->discard(SubExpr); @@ -4832,10 +4864,16 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { return this->emitCast(PT_Bool, ET, E); return true; case UO_Minus: // -x + if (!T) + return this->emitError(E); + if (!this->visit(SubExpr)) return false; return DiscardResult ? this->emitPop(*T, E) : this->emitNeg(*T, E); case UO_Plus: // +x + if (!T) + return this->emitError(E); + if (!this->visit(SubExpr)) // noop return false; return DiscardResult ? this->emitPop(*T, E) : true; @@ -4852,6 +4890,9 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { return this->discard(SubExpr); return this->visit(SubExpr); case UO_Not: // ~x + if (!T) + return this->emitError(E); + if (!this->visit(SubExpr)) return false; return DiscardResult ? this->emitPop(*T, E) : this->emitComp(*T, E); @@ -5094,9 +5135,10 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { if (E->getType()->isVoidType()) return true; // Convert the dummy pointer to another pointer type if we have to. - if (PrimType PT = classifyPrim(E); PT != PT_Ptr && isPtrType(PT)) { - if (!this->emitDecayPtr(PT_Ptr, PT, E)) - return false; + if (PrimType PT = classifyPrim(E); PT != PT_Ptr) { + if (isPtrType(PT)) + return this->emitDecayPtr(PT_Ptr, PT, E); + return false; } return true; } diff --git a/clang/lib/AST/Interp/Compiler.h b/clang/lib/AST/Interp/Compiler.h index de873c7e6825f9..23e7afd767e881 100644 --- a/clang/lib/AST/Interp/Compiler.h +++ b/clang/lib/AST/Interp/Compiler.h @@ -278,45 +278,6 @@ class Compiler : public ConstStmtVisitor, bool>, /// Visits an expression and converts it to a boolean. bool visitBool(const Expr *E); - /// Visits an initializer for a local. - bool visitLocalInitializer(const Expr *Init, unsigned I) { - if (!this->emitGetPtrLocal(I, Init)) - return false; - - if (!visitInitializer(Init)) - return false; - - if (!this->emitFinishInit(Init)) - return false; - - return this->emitPopPtr(Init); - } - - /// Visits an initializer for a global. - bool visitGlobalInitializer(const Expr *Init, unsigned I) { - if (!this->emitGetPtrGlobal(I, Init)) - return false; - - if (!visitInitializer(Init)) - return false; - - if (!this->emitFinishInit(Init)) - return false; - - return this->emitPopPtr(Init); - } - - /// Visits a delegated initializer. - bool visitThisInitializer(const Expr *I) { - if (!this->emitThis(I)) - return false; - - if (!visitInitializer(I)) - return false; - - return this->emitFinishInitPop(I); - } - bool visitInitList(ArrayRef Inits, const Expr *ArrayFiller, const Expr *E); bool visitArrayElemInit(unsigned ElemIndex, const Expr *Init); diff --git a/clang/lib/AST/Interp/Context.cpp b/clang/lib/AST/Interp/Context.cpp index 913e8d514282ad..b5e992c5a9ac16 100644 --- a/clang/lib/AST/Interp/Context.cpp +++ b/clang/lib/AST/Interp/Context.cpp @@ -31,6 +31,9 @@ bool Context::isPotentialConstantExpr(State &Parent, const FunctionDecl *FD) { if (!Func || !Func->hasBody()) Func = Compiler(*this, *P).compileFunc(FD); + if (!Func) + return false; + APValue DummyResult; if (!Run(Parent, Func, DummyResult)) return false; diff --git a/clang/lib/AST/Interp/EvalEmitter.cpp b/clang/lib/AST/Interp/EvalEmitter.cpp index 9701796fb93039..74413baf6fc0c9 100644 --- a/clang/lib/AST/Interp/EvalEmitter.cpp +++ b/clang/lib/AST/Interp/EvalEmitter.cpp @@ -152,6 +152,8 @@ template <> bool EvalEmitter::emitRet(const SourceInfo &Info) { // Implicitly convert lvalue to rvalue, if requested. if (ConvertResultToRValue) { + if (!Ptr.isZero() && !Ptr.isDereferencable()) + return false; // Never allow reading from a non-const pointer, unless the memory // has been created in this evaluation. if (!Ptr.isConst() && Ptr.block()->getEvalID() != Ctx.getEvalID()) diff --git a/clang/lib/AST/Interp/Integral.h b/clang/lib/AST/Interp/Integral.h index cc1cab8f39fb1e..db4cc9ae45b491 100644 --- a/clang/lib/AST/Interp/Integral.h +++ b/clang/lib/AST/Interp/Integral.h @@ -98,10 +98,10 @@ template class Integral final { return Integral(V); } - explicit operator unsigned() const { return V; } - explicit operator int64_t() const { return V; } - explicit operator uint64_t() const { return V; } - explicit operator int32_t() const { return V; } + template >> + explicit operator Ty() const { + return V; + } APSInt toAPSInt() const { return APSInt(APInt(Bits, static_cast(V), Signed), !Signed); diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp index 0411fcad88ad0a..b673cc27aee21f 100644 --- a/clang/lib/AST/Interp/Interp.cpp +++ b/clang/lib/AST/Interp/Interp.cpp @@ -248,7 +248,8 @@ bool CheckExtern(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { if (!Ptr.isExtern()) return true; - if (Ptr.isInitialized()) + if (Ptr.isInitialized() || + (Ptr.getDeclDesc()->asVarDecl() == S.EvaluatingDecl)) return true; if (!S.checkingPotentialConstantExpression() && S.getLangOpts().CPlusPlus) { @@ -405,10 +406,16 @@ bool CheckConst(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { // The This pointer is writable in constructors and destructors, // even if isConst() returns true. - if (const Function *Func = S.Current->getFunction(); - Func && (Func->isConstructor() || Func->isDestructor()) && - Ptr.block() == S.Current->getThis().block()) { - return true; + // TODO(perf): We could be hitting this code path quite a lot in complex + // constructors. Is there a better way to do this? + if (S.Current->getFunction()) { + for (const InterpFrame *Frame = S.Current; Frame; Frame = Frame->Caller) { + if (const Function *Func = Frame->getFunction(); + Func && (Func->isConstructor() || Func->isDestructor()) && + Ptr.block() == Frame->getThis().block()) { + return true; + } + } } if (!Ptr.isBlockPointer()) @@ -438,6 +445,27 @@ bool CheckMutable(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { return false; } +bool CheckVolatile(InterpState &S, CodePtr OpPC, const Pointer &Ptr, + AccessKinds AK) { + assert(Ptr.isLive()); + + // FIXME: This check here might be kinda expensive. Maybe it would be better + // to have another field in InlineDescriptor for this? + if (!Ptr.isBlockPointer()) + return true; + + QualType PtrType = Ptr.getType(); + if (!PtrType.isVolatileQualified()) + return true; + + const SourceInfo &Loc = S.Current->getSource(OpPC); + if (S.getLangOpts().CPlusPlus) + S.FFDiag(Loc, diag::note_constexpr_access_volatile_type) << AK << PtrType; + else + S.FFDiag(Loc); + return false; +} + bool CheckInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr, AccessKinds AK) { assert(Ptr.isLive()); @@ -502,6 +530,8 @@ bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr, return false; if (!CheckMutable(S, OpPC, Ptr)) return false; + if (!CheckVolatile(S, OpPC, Ptr, AK)) + return false; return true; } diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 1df8d65c804454..c7d8604c7dc2ab 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -147,7 +147,7 @@ bool CheckShift(InterpState &S, CodePtr OpPC, const LT &LHS, const RT &RHS, const APSInt Val = RHS.toAPSInt(); QualType Ty = E->getType(); S.CCEDiag(E, diag::note_constexpr_large_shift) << Val << Ty << Bits; - return true; // We will do the shift anyway but fix up the shift amount. + return !(S.getEvalStatus().Diag && !S.getEvalStatus().Diag->empty() && S.getLangOpts().CPlusPlus11); } if (LHS.isSigned() && !S.getLangOpts().CPlusPlus20) { @@ -302,15 +302,16 @@ bool AddSubMulHelper(InterpState &S, CodePtr OpPC, unsigned Bits, const T &LHS, auto Loc = E->getExprLoc(); S.report(Loc, diag::warn_integer_constant_overflow) << Trunc << Type << E->getSourceRange(); - return true; - } else { - S.CCEDiag(E, diag::note_constexpr_overflow) << Value << Type; - if (!S.noteUndefinedBehavior()) { - S.Stk.pop(); - return false; - } - return true; } + + S.CCEDiag(E, diag::note_constexpr_overflow) << Value << Type; + + if (!S.noteUndefinedBehavior()) { + S.Stk.pop(); + return false; + } + + return true; } template ::T> @@ -2208,13 +2209,10 @@ inline bool RVOPtr(InterpState &S, CodePtr OpPC) { //===----------------------------------------------------------------------===// // Shr, Shl //===----------------------------------------------------------------------===// +enum class ShiftDir { Left, Right }; -template -inline bool Shr(InterpState &S, CodePtr OpPC) { - using LT = typename PrimConv::T; - using RT = typename PrimConv::T; - auto RHS = S.Stk.pop(); - const auto &LHS = S.Stk.pop(); +template +inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) { const unsigned Bits = LHS.bitWidth(); // OpenCL 6.3j: shift values are effectively % word size of LHS. @@ -2222,6 +2220,34 @@ inline bool Shr(InterpState &S, CodePtr OpPC) { RT::bitAnd(RHS, RT::from(LHS.bitWidth() - 1, RHS.bitWidth()), RHS.bitWidth(), &RHS); + if (RHS.isNegative()) { + // During constant-folding, a negative shift is an opposite shift. Such a + // shift is not a constant expression. + const SourceInfo &Loc = S.Current->getSource(OpPC); + S.CCEDiag(Loc, diag::note_constexpr_negative_shift) << RHS.toAPSInt(); + if (S.getLangOpts().CPlusPlus11 && S.getEvalStatus().Diag && + !S.getEvalStatus().Diag->empty()) + return false; + RHS = -RHS; + return DoShift < LT, RT, + Dir == ShiftDir::Left ? ShiftDir::Right + : ShiftDir::Left > (S, OpPC, LHS, RHS); + } + + if constexpr (Dir == ShiftDir::Left) { + if (LHS.isNegative() && !S.getLangOpts().CPlusPlus20) { + // C++11 [expr.shift]p2: A signed left shift must have a non-negative + // operand, and must not overflow the corresponding unsigned type. + // C++2a [expr.shift]p2: E1 << E2 is the unique value congruent to + // E1 x 2^E2 module 2^N. + const SourceInfo &Loc = S.Current->getSource(OpPC); + S.CCEDiag(Loc, diag::note_constexpr_lshift_of_negative) << LHS.toAPSInt(); + if (S.getLangOpts().CPlusPlus11 && S.getEvalStatus().Diag && + !S.getEvalStatus().Diag->empty()) + return false; + } + } + if (!CheckShift(S, OpPC, LHS, RHS, Bits)) return false; @@ -2229,45 +2255,44 @@ inline bool Shr(InterpState &S, CodePtr OpPC) { // it has already been diagnosed by CheckShift() above, // but we still need to handle it. typename LT::AsUnsigned R; - if (RHS > RT::from(Bits - 1, RHS.bitWidth())) - LT::AsUnsigned::shiftRight(LT::AsUnsigned::from(LHS), - LT::AsUnsigned::from(Bits - 1), Bits, &R); - else - LT::AsUnsigned::shiftRight(LT::AsUnsigned::from(LHS), - LT::AsUnsigned::from(RHS, Bits), Bits, &R); + if constexpr (Dir == ShiftDir::Left) { + if (RHS > RT::from(Bits - 1, RHS.bitWidth())) + LT::AsUnsigned::shiftLeft(LT::AsUnsigned::from(LHS), + LT::AsUnsigned::from(Bits - 1), Bits, &R); + else + LT::AsUnsigned::shiftLeft(LT::AsUnsigned::from(LHS), + LT::AsUnsigned::from(RHS, Bits), Bits, &R); + } else { + if (RHS > RT::from(Bits - 1, RHS.bitWidth())) + LT::AsUnsigned::shiftRight(LT::AsUnsigned::from(LHS), + LT::AsUnsigned::from(Bits - 1), Bits, &R); + else + LT::AsUnsigned::shiftRight(LT::AsUnsigned::from(LHS), + LT::AsUnsigned::from(RHS, Bits), Bits, &R); + } + S.Stk.push(LT::from(R)); return true; } template -inline bool Shl(InterpState &S, CodePtr OpPC) { +inline bool Shr(InterpState &S, CodePtr OpPC) { using LT = typename PrimConv::T; using RT = typename PrimConv::T; auto RHS = S.Stk.pop(); - const auto &LHS = S.Stk.pop(); - const unsigned Bits = LHS.bitWidth(); + auto LHS = S.Stk.pop(); - // OpenCL 6.3j: shift values are effectively % word size of LHS. - if (S.getLangOpts().OpenCL) - RT::bitAnd(RHS, RT::from(LHS.bitWidth() - 1, RHS.bitWidth()), - RHS.bitWidth(), &RHS); - - if (!CheckShift(S, OpPC, LHS, RHS, Bits)) - return false; + return DoShift(S, OpPC, LHS, RHS); +} - // Limit the shift amount to Bits - 1. If this happened, - // it has already been diagnosed by CheckShift() above, - // but we still need to handle it. - typename LT::AsUnsigned R; - if (RHS > RT::from(Bits - 1, RHS.bitWidth())) - LT::AsUnsigned::shiftLeft(LT::AsUnsigned::from(LHS), - LT::AsUnsigned::from(Bits - 1), Bits, &R); - else - LT::AsUnsigned::shiftLeft(LT::AsUnsigned::from(LHS), - LT::AsUnsigned::from(RHS, Bits), Bits, &R); +template +inline bool Shl(InterpState &S, CodePtr OpPC) { + using LT = typename PrimConv::T; + using RT = typename PrimConv::T; + auto RHS = S.Stk.pop(); + auto LHS = S.Stk.pop(); - S.Stk.push(LT::from(R)); - return true; + return DoShift(S, OpPC, LHS, RHS); } //===----------------------------------------------------------------------===// diff --git a/clang/lib/AST/Interp/InterpFrame.cpp b/clang/lib/AST/Interp/InterpFrame.cpp index b33f74dfe99f1c..d3f3e216b7eb25 100644 --- a/clang/lib/AST/Interp/InterpFrame.cpp +++ b/clang/lib/AST/Interp/InterpFrame.cpp @@ -158,7 +158,9 @@ void InterpFrame::describe(llvm::raw_ostream &OS) const { // diagnose them. The 'in call to' diagnostics for them add no value to the // user _and_ it doesn't generally work since the argument types don't always // match the function prototype. Just ignore them. - if (const auto *F = getFunction(); F && F->isBuiltin()) + // Similarly, for lambda static invokers, we would just print __invoke(). + if (const auto *F = getFunction(); + F && (F->isBuiltin() || F->isLambdaStaticInvoker())) return; const FunctionDecl *F = getCallee(); @@ -167,7 +169,10 @@ void InterpFrame::describe(llvm::raw_ostream &OS) const { print(OS, This, S.getCtx(), S.getCtx().getRecordType(M->getParent())); OS << "->"; } - OS << *F << "("; + + F->getNameForDiagnostic(OS, S.getCtx().getPrintingPolicy(), + /*Qualified=*/false); + OS << '('; unsigned Off = 0; Off += Func->hasRVO() ? primSize(PT_Ptr) : 0; diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h index 6e9e8675306ef1..28bc42985adb2a 100644 --- a/clang/lib/AST/Interp/Pointer.h +++ b/clang/lib/AST/Interp/Pointer.h @@ -584,6 +584,7 @@ class Pointer { assert(isLive() && "Invalid pointer"); assert(isBlockPointer()); assert(asBlockPointer().Pointee); + assert(isDereferencable()); assert(Offset + sizeof(T) <= asBlockPointer().Pointee->getDescriptor()->getAllocSize()); @@ -603,6 +604,17 @@ class Pointer { sizeof(InitMapPtr))[I]; } + /// Whether this block can be read from at all. This is only true for + /// block pointers that point to a valid location inside that block. + bool isDereferencable() const { + if (!isBlockPointer()) + return false; + if (isPastEnd()) + return false; + + return true; + } + /// Initializes a field. void initialize() const; /// Activats a field. diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index f14c4762bee6ff..40ef82785f4540 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -594,10 +594,11 @@ class CXXNameMangler { void mangleMemberExprBase(const Expr *base, bool isArrow); void mangleMemberExpr(const Expr *base, bool isArrow, NestedNameSpecifier *qualifier, - ArrayRef UnqualifiedLookups, + NamedDecl *firstQualifierLookup, DeclarationName name, const TemplateArgumentLoc *TemplateArgs, - unsigned NumTemplateArgs, unsigned knownArity); + unsigned NumTemplateArgs, + unsigned knownArity); void mangleCastExpression(const Expr *E, StringRef CastEncoding); void mangleInitListElements(const InitListExpr *InitList); void mangleRequirement(SourceLocation RequiresExprLoc, @@ -960,7 +961,7 @@ bool CXXNameMangler::isStd(const NamespaceDecl *NS) { if (!Context.getEffectiveParentContext(NS)->isTranslationUnit()) return false; - const IdentifierInfo *II = NS->getOriginalNamespace()->getIdentifier(); + const IdentifierInfo *II = NS->getFirstDecl()->getIdentifier(); return II && II->isStr("std"); } @@ -4495,11 +4496,14 @@ void CXXNameMangler::mangleMemberExprBase(const Expr *Base, bool IsArrow) { } /// Mangles a member expression. -void CXXNameMangler::mangleMemberExpr( - const Expr *base, bool isArrow, NestedNameSpecifier *qualifier, - ArrayRef UnqualifiedLookups, DeclarationName member, - const TemplateArgumentLoc *TemplateArgs, unsigned NumTemplateArgs, - unsigned arity) { +void CXXNameMangler::mangleMemberExpr(const Expr *base, + bool isArrow, + NestedNameSpecifier *qualifier, + NamedDecl *firstQualifierLookup, + DeclarationName member, + const TemplateArgumentLoc *TemplateArgs, + unsigned NumTemplateArgs, + unsigned arity) { // ::= dt // ::= pt if (base) @@ -4981,9 +4985,11 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity, case Expr::MemberExprClass: { NotPrimaryExpr(); const MemberExpr *ME = cast(E); - mangleMemberExpr(ME->getBase(), ME->isArrow(), ME->getQualifier(), - std::nullopt, ME->getMemberDecl()->getDeclName(), - ME->getTemplateArgs(), ME->getNumTemplateArgs(), Arity); + mangleMemberExpr(ME->getBase(), ME->isArrow(), + ME->getQualifier(), nullptr, + ME->getMemberDecl()->getDeclName(), + ME->getTemplateArgs(), ME->getNumTemplateArgs(), + Arity); break; } @@ -4991,9 +4997,10 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity, NotPrimaryExpr(); const UnresolvedMemberExpr *ME = cast(E); mangleMemberExpr(ME->isImplicitAccess() ? nullptr : ME->getBase(), - ME->isArrow(), ME->getQualifier(), std::nullopt, - ME->getMemberName(), ME->getTemplateArgs(), - ME->getNumTemplateArgs(), Arity); + ME->isArrow(), ME->getQualifier(), nullptr, + ME->getMemberName(), + ME->getTemplateArgs(), ME->getNumTemplateArgs(), + Arity); break; } @@ -5003,8 +5010,10 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity, = cast(E); mangleMemberExpr(ME->isImplicitAccess() ? nullptr : ME->getBase(), ME->isArrow(), ME->getQualifier(), - ME->unqualified_lookups(), ME->getMember(), - ME->getTemplateArgs(), ME->getNumTemplateArgs(), Arity); + ME->getFirstQualifierFoundInScope(), + ME->getMember(), + ME->getTemplateArgs(), ME->getNumTemplateArgs(), + Arity); break; } diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp index 339477dc65f0f7..eeb314b8d32b01 100644 --- a/clang/lib/AST/JSONNodeDumper.cpp +++ b/clang/lib/AST/JSONNodeDumper.cpp @@ -883,9 +883,8 @@ void JSONNodeDumper::VisitNamespaceDecl(const NamespaceDecl *ND) { VisitNamedDecl(ND); attributeOnlyIfTrue("isInline", ND->isInline()); attributeOnlyIfTrue("isNested", ND->isNested()); - if (!ND->isOriginalNamespace()) - JOS.attribute("originalNamespace", - createBareDeclRef(ND->getOriginalNamespace())); + if (!ND->isFirstDecl()) + JOS.attribute("originalNamespace", createBareDeclRef(ND->getFirstDecl())); } void JSONNodeDumper::VisitUsingDirectiveDecl(const UsingDirectiveDecl *UDD) { diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index a26f50f0719c11..5ba9523504258e 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -2386,8 +2386,8 @@ void TextNodeDumper::VisitNamespaceDecl(const NamespaceDecl *D) { OS << " inline"; if (D->isNested()) OS << " nested"; - if (!D->isOriginalNamespace()) - dumpDeclRef(D->getOriginalNamespace(), "original"); + if (!D->isFirstDecl()) + dumpDeclRef(D->getFirstDecl(), "original"); } void TextNodeDumper::VisitUsingDirectiveDecl(const UsingDirectiveDecl *D) { diff --git a/clang/lib/Basic/CMakeLists.txt b/clang/lib/Basic/CMakeLists.txt index e7ebc8f191aa6b..f30680552e0f5b 100644 --- a/clang/lib/Basic/CMakeLists.txt +++ b/clang/lib/Basic/CMakeLists.txt @@ -102,6 +102,7 @@ add_clang_library(clangBasic Targets/DirectX.cpp Targets/Hexagon.cpp Targets/Lanai.cpp + Targets/Le64.cpp Targets/LoongArch.cpp Targets/M68k.cpp Targets/MSP430.cpp diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp index 0b8e565345b6a4..29133f9ee8fcec 100644 --- a/clang/lib/Basic/Targets.cpp +++ b/clang/lib/Basic/Targets.cpp @@ -23,6 +23,7 @@ #include "Targets/DirectX.h" #include "Targets/Hexagon.h" #include "Targets/Lanai.h" +#include "Targets/Le64.h" #include "Targets/LoongArch.h" #include "Targets/M68k.h" #include "Targets/MSP430.h" @@ -343,6 +344,17 @@ std::unique_ptr AllocateTarget(const llvm::Triple &Triple, return std::make_unique(Triple, Opts); } + case llvm::Triple::le32: + switch (os) { + case llvm::Triple::NaCl: + return std::make_unique>(Triple, Opts); + default: + return nullptr; + } + + case llvm::Triple::le64: + return std::make_unique(Triple, Opts); + case llvm::Triple::ppc: switch (os) { case llvm::Triple::Linux: diff --git a/clang/lib/Basic/Targets/Le64.cpp b/clang/lib/Basic/Targets/Le64.cpp new file mode 100644 index 00000000000000..f7afa0e747d67b --- /dev/null +++ b/clang/lib/Basic/Targets/Le64.cpp @@ -0,0 +1,30 @@ +//===--- Le64.cpp - Implement Le64 target feature support -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements Le64 TargetInfo objects. +// +//===----------------------------------------------------------------------===// + +#include "Le64.h" +#include "Targets.h" +#include "clang/Basic/Builtins.h" +#include "clang/Basic/MacroBuilder.h" +#include "clang/Basic/TargetBuiltins.h" + +using namespace clang; +using namespace clang::targets; + +ArrayRef Le64TargetInfo::getTargetBuiltins() const { + return {}; +} + +void Le64TargetInfo::getTargetDefines(const LangOptions &Opts, + MacroBuilder &Builder) const { + DefineStd(Builder, "unix", Opts); + defineCPUMacros(Builder, "le64", /*Tuning=*/false); +} diff --git a/clang/lib/Basic/Targets/Le64.h b/clang/lib/Basic/Targets/Le64.h new file mode 100644 index 00000000000000..45f6a4e9dd75d8 --- /dev/null +++ b/clang/lib/Basic/Targets/Le64.h @@ -0,0 +1,64 @@ +//===--- Le64.h - Declare Le64 target feature support -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares Le64 TargetInfo objects. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LIB_BASIC_TARGETS_LE64_H +#define LLVM_CLANG_LIB_BASIC_TARGETS_LE64_H + +#include "clang/Basic/TargetInfo.h" +#include "clang/Basic/TargetOptions.h" +#include "llvm/Support/Compiler.h" +#include "llvm/TargetParser/Triple.h" + +namespace clang { +namespace targets { + +class LLVM_LIBRARY_VISIBILITY Le64TargetInfo : public TargetInfo { + +public: + Le64TargetInfo(const llvm::Triple &Triple, const TargetOptions &) + : TargetInfo(Triple) { + NoAsmVariants = true; + LongWidth = LongAlign = PointerWidth = PointerAlign = 64; + MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64; + resetDataLayout("e-m:e-v128:32-v16:16-v32:32-v96:32-n8:16:32:64-S128"); + } + + void getTargetDefines(const LangOptions &Opts, + MacroBuilder &Builder) const override; + + ArrayRef getTargetBuiltins() const override; + + BuiltinVaListKind getBuiltinVaListKind() const override { + return TargetInfo::PNaClABIBuiltinVaList; + } + + std::string_view getClobbers() const override { return ""; } + + ArrayRef getGCCRegNames() const override { + return std::nullopt; + } + + ArrayRef getGCCRegAliases() const override { + return std::nullopt; + } + + bool validateAsmConstraint(const char *&Name, + TargetInfo::ConstraintInfo &Info) const override { + return false; + } + + bool hasProtectedVisibility() const override { return false; } +}; + +} // namespace targets +} // namespace clang +#endif // LLVM_CLANG_LIB_BASIC_TARGETS_LE64_H diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h index 0a4f06967fff5a..5f27c3469f861d 100644 --- a/clang/lib/Basic/Targets/OSTargets.h +++ b/clang/lib/Basic/Targets/OSTargets.h @@ -841,6 +841,9 @@ class LLVM_LIBRARY_VISIBILITY NaClTargetInfo : public OSTargetInfo { "i64:64-i128:128-n8:16:32:64-S128"); } else if (Triple.getArch() == llvm::Triple::mipsel) { // Handled on mips' setDataLayout. + } else { + assert(Triple.getArch() == llvm::Triple::le32); + this->resetDataLayout("e-p:32:32-i64:64"); } } }; diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp index 25ae7d64b577e4..9159162f01d1bd 100644 --- a/clang/lib/Basic/Targets/RISCV.cpp +++ b/clang/lib/Basic/Targets/RISCV.cpp @@ -211,7 +211,7 @@ void RISCVTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__riscv_v_fixed_vlen", Twine(VScale->first * llvm::RISCV::RVVBitsPerBlock)); - if (FastUnalignedAccess) + if (FastScalarUnalignedAccess) Builder.defineMacro("__riscv_misaligned_fast"); else Builder.defineMacro("__riscv_misaligned_avoid"); @@ -353,8 +353,8 @@ bool RISCVTargetInfo::handleTargetFeatures(std::vector &Features, if (ISAInfo->hasExtension("zfh") || ISAInfo->hasExtension("zhinx")) HasLegalHalfType = true; - FastUnalignedAccess = llvm::is_contained(Features, "+unaligned-scalar-mem") && - llvm::is_contained(Features, "+unaligned-vector-mem"); + FastScalarUnalignedAccess = + llvm::is_contained(Features, "+unaligned-scalar-mem"); if (llvm::is_contained(Features, "+experimental")) HasExperimental = true; diff --git a/clang/lib/Basic/Targets/RISCV.h b/clang/lib/Basic/Targets/RISCV.h index d0e9cdc6da07b3..d5df6344bedc09 100644 --- a/clang/lib/Basic/Targets/RISCV.h +++ b/clang/lib/Basic/Targets/RISCV.h @@ -30,7 +30,7 @@ class RISCVTargetInfo : public TargetInfo { std::unique_ptr ISAInfo; private: - bool FastUnalignedAccess; + bool FastScalarUnalignedAccess; bool HasExperimental = false; public: diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 1f6fc842ddd955..121a2c2d795fec 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -258,7 +258,9 @@ bool X86TargetInfo::handleTargetFeatures(std::vector &Features, if (Feature[0] != '+') continue; - if (Feature == "+aes") { + if (Feature == "+mmx") { + HasMMX = true; + } else if (Feature == "+aes") { HasAES = true; } else if (Feature == "+vaes") { HasVAES = true; @@ -487,13 +489,6 @@ bool X86TargetInfo::handleTargetFeatures(std::vector &Features, // for bfloat16 arithmetic operations in the front-end. HasBFloat16 = SSELevel >= SSE2; - MMX3DNowEnum ThreeDNowLevel = llvm::StringSwitch(Feature) - .Case("+3dnowa", AMD3DNowAthlon) - .Case("+3dnow", AMD3DNow) - .Case("+mmx", MMX) - .Default(NoMMX3DNow); - MMX3DNowLevel = std::max(MMX3DNowLevel, ThreeDNowLevel); - XOPEnum XLevel = llvm::StringSwitch(Feature) .Case("+xop", XOP) .Case("+fma4", FMA4) @@ -1031,18 +1026,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, } // Each case falls through to the previous one here. - switch (MMX3DNowLevel) { - case AMD3DNowAthlon: - Builder.defineMacro("__3dNOW_A__"); - [[fallthrough]]; - case AMD3DNow: - Builder.defineMacro("__3dNOW__"); - [[fallthrough]]; - case MMX: + if (HasMMX) { Builder.defineMacro("__MMX__"); - [[fallthrough]]; - case NoMMX3DNow: - break; } if (CPU >= CK_i486 || CPU == CK_None) { @@ -1061,8 +1046,6 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, bool X86TargetInfo::isValidFeatureName(StringRef Name) const { return llvm::StringSwitch(Name) - .Case("3dnow", true) - .Case("3dnowa", true) .Case("adx", true) .Case("aes", true) .Case("amx-bf16", true) @@ -1232,9 +1215,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("widekl", HasWIDEKL) .Case("lwp", HasLWP) .Case("lzcnt", HasLZCNT) - .Case("mm3dnow", MMX3DNowLevel >= AMD3DNow) - .Case("mm3dnowa", MMX3DNowLevel >= AMD3DNowAthlon) - .Case("mmx", MMX3DNowLevel >= MMX) + .Case("mmx", HasMMX) .Case("movbe", HasMOVBE) .Case("movdiri", HasMOVDIRI) .Case("movdir64b", HasMOVDIR64B) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index a70711f4ae2bb2..cdec41afd1a4b2 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -67,12 +67,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { AVX2, AVX512F } SSELevel = NoSSE; - enum MMX3DNowEnum { - NoMMX3DNow, - MMX, - AMD3DNow, - AMD3DNowAthlon - } MMX3DNowLevel = NoMMX3DNow; + bool HasMMX = false; enum XOPEnum { NoXOP, SSE4A, FMA4, XOP } XOPLevel = NoXOP; enum AddrSpace { ptr32_sptr = 270, ptr32_uptr = 271, ptr64 = 272 }; @@ -348,8 +343,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { return "avx512"; if (getTriple().getArch() == llvm::Triple::x86_64 && SSELevel >= AVX) return "avx"; - if (getTriple().getArch() == llvm::Triple::x86 && - MMX3DNowLevel == NoMMX3DNow) + if (getTriple().getArch() == llvm::Triple::x86 && !HasMMX) return "no-mmx"; return ""; } diff --git a/clang/lib/CodeGen/ABIInfoImpl.cpp b/clang/lib/CodeGen/ABIInfoImpl.cpp index e9a26abb778375..35e8f79ba1bac7 100644 --- a/clang/lib/CodeGen/ABIInfoImpl.cpp +++ b/clang/lib/CodeGen/ABIInfoImpl.cpp @@ -310,6 +310,41 @@ bool CodeGen::isEmptyRecord(ASTContext &Context, QualType T, bool AllowArrays, return true; } +bool CodeGen::isEmptyFieldForLayout(const ASTContext &Context, + const FieldDecl *FD) { + if (FD->isZeroLengthBitField(Context)) + return true; + + if (FD->isUnnamedBitField()) + return false; + + return isEmptyRecordForLayout(Context, FD->getType()); +} + +bool CodeGen::isEmptyRecordForLayout(const ASTContext &Context, QualType T) { + const RecordType *RT = T->getAs(); + if (!RT) + return false; + + const RecordDecl *RD = RT->getDecl(); + + // If this is a C++ record, check the bases first. + if (const CXXRecordDecl *CXXRD = dyn_cast(RD)) { + if (CXXRD->isDynamicClass()) + return false; + + for (const auto &I : CXXRD->bases()) + if (!isEmptyRecordForLayout(Context, I.getType())) + return false; + } + + for (const auto *I : RD->fields()) + if (!isEmptyFieldForLayout(Context, I)) + return false; + + return true; +} + const Type *CodeGen::isSingleElementStruct(QualType T, ASTContext &Context) { const RecordType *RT = T->getAs(); if (!RT) diff --git a/clang/lib/CodeGen/ABIInfoImpl.h b/clang/lib/CodeGen/ABIInfoImpl.h index 92986fb4316465..2a3ef6b8a6c961 100644 --- a/clang/lib/CodeGen/ABIInfoImpl.h +++ b/clang/lib/CodeGen/ABIInfoImpl.h @@ -137,6 +137,16 @@ bool isEmptyField(ASTContext &Context, const FieldDecl *FD, bool AllowArrays, bool isEmptyRecord(ASTContext &Context, QualType T, bool AllowArrays, bool AsIfNoUniqueAddr = false); +/// isEmptyFieldForLayout - Return true iff the field is "empty", that is, +/// either a zero-width bit-field or an \ref isEmptyRecordForLayout. +bool isEmptyFieldForLayout(const ASTContext &Context, const FieldDecl *FD); + +/// isEmptyRecordForLayout - Return true iff a structure contains only empty +/// base classes (per \ref isEmptyRecordForLayout) and fields (per +/// \ref isEmptyFieldForLayout). Note, C++ record fields are considered empty +/// if the [[no_unique_address]] attribute would have made them empty. +bool isEmptyRecordForLayout(const ASTContext &Context, QualType T); + /// isSingleElementStruct - Determine if a structure is a "single /// element struct", i.e. it has exactly one non-empty field or /// exactly one field which is itself a single element diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 6cc0d9485720c0..67027f8aa93f33 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -2587,6 +2587,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_fma: case Builtin::BI__builtin_fmaf: case Builtin::BI__builtin_fmal: + case Builtin::BI__builtin_fmaf16: case Builtin::BIfma: case Builtin::BIfmaf: case Builtin::BIfmal: { @@ -15968,14 +15969,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(F, {Ops[0]}); } - // 3DNow! - case X86::BI__builtin_ia32_pswapdsf: - case X86::BI__builtin_ia32_pswapdsi: { - llvm::Type *MMXTy = llvm::Type::getX86_MMXTy(getLLVMContext()); - Ops[0] = Builder.CreateBitCast(Ops[0], MMXTy, "cast"); - llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_3dnowa_pswapd); - return Builder.CreateCall(F, Ops, "pswapd"); - } case X86::BI__builtin_ia32_rdrand16_step: case X86::BI__builtin_ia32_rdrand32_step: case X86::BI__builtin_ia32_rdrand64_step: @@ -18388,13 +18381,12 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, llvm_unreachable("rcp operand must have a float representation"); llvm::Type *Ty = Op0->getType(); llvm::Type *EltTy = Ty->getScalarType(); - Constant *One = - Ty->isVectorTy() - ? ConstantVector::getSplat( - ElementCount::getFixed( - dyn_cast(Ty)->getNumElements()), - ConstantFP::get(EltTy, 1.0)) - : ConstantFP::get(EltTy, 1.0); + Constant *One = Ty->isVectorTy() + ? ConstantVector::getSplat( + ElementCount::getFixed( + cast(Ty)->getNumElements()), + ConstantFP::get(EltTy, 1.0)) + : ConstantFP::get(EltTy, 1.0); return Builder.CreateFDiv(One, Op0, "hlsl.rcp"); } case Builtin::BI__builtin_hlsl_elementwise_rsqrt: { diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index a38484941ba246..d582aba679ddc4 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -3863,7 +3863,8 @@ void CodeGenFunction::EmitFunctionEpilog(const CGFunctionInfo &FI, LValue ArgVal = LValue::MakeAddr(ArgAddr, RetTy, getContext(), BaseInfo, TBAAInfo); EmitStoreOfScalar( - Builder.CreateLoad(ReturnValue), ArgVal, /*isInit*/ true); + EmitLoadOfScalar(MakeAddrLValue(ReturnValue, RetTy), EndLoc), ArgVal, + /*isInit*/ true); break; } } diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index 0a595bb998d261..667e260f2228dc 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "ABIInfoImpl.h" #include "CGBlocks.h" #include "CGCXXABI.h" #include "CGDebugInfo.h" @@ -933,7 +934,7 @@ namespace { } void addMemcpyableField(FieldDecl *F) { - if (F->isZeroSize(CGF.getContext())) + if (isEmptyFieldForLayout(CGF.getContext(), F)) return; if (!FirstField) addInitialField(F); @@ -1815,7 +1816,7 @@ namespace { const CXXDestructorDecl *DD) : Context(Context), EHStack(EHStack), DD(DD), StartIndex(std::nullopt) {} void PushCleanupForField(const FieldDecl *Field) { - if (Field->isZeroSize(Context)) + if (isEmptyFieldForLayout(Context, Field)) return; unsigned FieldIndex = Field->getFieldIndex(); if (FieldHasTrivialDestructorBody(Context, Field)) { diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index 90aa4c0745a8ab..c3251bb5ab5657 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -33,6 +33,7 @@ #include "clang/Basic/TargetInfo.h" #include "clang/CodeGen/CGFunctionInfo.h" #include "clang/Sema/Sema.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GlobalVariable.h" @@ -1969,6 +1970,17 @@ void CodeGenFunction::EmitAutoVarInit(const AutoVarEmission &emission) { constant = constWithPadding(CGM, IsPattern::No, replaceUndef(CGM, isPattern, constant)); } + + if (D.getType()->isBitIntType() && + CGM.getTypes().typeRequiresSplitIntoByteArray(D.getType())) { + // Constants for long _BitInt types are split into individual bytes. + // Try to fold these back into an integer constant so it can be stored + // properly. + llvm::Type *LoadType = CGM.getTypes().convertTypeForLoadStore( + D.getType(), constant->getType()); + constant = llvm::ConstantFoldLoadFromConst( + constant, LoadType, llvm::APInt::getZero(32), CGM.getDataLayout()); + } } if (!constant) { diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 039f60c7745918..5fdd3cc490e593 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "ABIInfoImpl.h" #include "CGCUDARuntime.h" #include "CGCXXABI.h" #include "CGCall.h" @@ -1986,6 +1987,9 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(Address Addr, bool Volatile, return EmitAtomicLoad(AtomicLValue, Loc).getScalarVal(); } + Addr = + Addr.withElementType(convertTypeForLoadStore(Ty, Addr.getElementType())); + llvm::LoadInst *Load = Builder.CreateLoad(Addr, Volatile); if (isNontemporal) { llvm::MDNode *Node = llvm::MDNode::get( @@ -2008,27 +2012,33 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(Address Addr, bool Volatile, return EmitFromMemory(Load, Ty); } +/// Converts a scalar value from its primary IR type (as returned +/// by ConvertType) to its load/store type (as returned by +/// convertTypeForLoadStore). llvm::Value *CodeGenFunction::EmitToMemory(llvm::Value *Value, QualType Ty) { - // Bool has a different representation in memory than in registers. - if (hasBooleanRepresentation(Ty)) { - // This should really always be an i1, but sometimes it's already - // an i8, and it's awkward to track those cases down. - if (Value->getType()->isIntegerTy(1)) - return Builder.CreateZExt(Value, ConvertTypeForMem(Ty), "frombool"); - assert(Value->getType()->isIntegerTy(getContext().getTypeSize(Ty)) && - "wrong value rep of bool"); + if (hasBooleanRepresentation(Ty) || Ty->isBitIntType()) { + llvm::Type *StoreTy = convertTypeForLoadStore(Ty, Value->getType()); + bool Signed = Ty->isSignedIntegerOrEnumerationType(); + return Builder.CreateIntCast(Value, StoreTy, Signed, "storedv"); + } + + if (Ty->isExtVectorBoolType()) { + llvm::Type *StoreTy = convertTypeForLoadStore(Ty, Value->getType()); + // Expand to the memory bit width. + unsigned MemNumElems = StoreTy->getPrimitiveSizeInBits(); + // -->

    . + Value = emitBoolVecConversion(Value, MemNumElems, "insertvec"); + //

    --> iP. + Value = Builder.CreateBitCast(Value, StoreTy); } return Value; } +/// Converts a scalar value from its load/store type (as returned +/// by convertTypeForLoadStore) to its primary IR type (as returned +/// by ConvertType). llvm::Value *CodeGenFunction::EmitFromMemory(llvm::Value *Value, QualType Ty) { - // Bool has a different representation in memory than in registers. - if (hasBooleanRepresentation(Ty)) { - assert(Value->getType()->isIntegerTy(getContext().getTypeSize(Ty)) && - "wrong value rep of bool"); - return Builder.CreateTrunc(Value, Builder.getInt1Ty(), "tobool"); - } if (Ty->isExtVectorBoolType()) { const auto *RawIntTy = Value->getType(); // Bitcast iP -->

    . @@ -2041,6 +2051,11 @@ llvm::Value *CodeGenFunction::EmitFromMemory(llvm::Value *Value, QualType Ty) { return emitBoolVecConversion(V, ValNumElems, "extractvec"); } + if (hasBooleanRepresentation(Ty) || Ty->isBitIntType()) { + llvm::Type *ResTy = ConvertType(Ty); + return Builder.CreateTrunc(Value, ResTy, "loadedv"); + } + return Value; } @@ -2093,17 +2108,10 @@ void CodeGenFunction::EmitStoreOfScalar(llvm::Value *Value, Address Addr, llvm::Type *SrcTy = Value->getType(); if (const auto *ClangVecTy = Ty->getAs()) { auto *VecTy = dyn_cast(SrcTy); - if (VecTy && ClangVecTy->isExtVectorBoolType()) { - auto *MemIntTy = cast(Addr.getElementType()); - // Expand to the memory bit width. - unsigned MemNumElems = MemIntTy->getPrimitiveSizeInBits(); - // -->

    . - Value = emitBoolVecConversion(Value, MemNumElems, "insertvec"); - //

    --> iP. - Value = Builder.CreateBitCast(Value, MemIntTy); - } else if (!CGM.getCodeGenOpts().PreserveVec3Type) { + if (!CGM.getCodeGenOpts().PreserveVec3Type) { // Handle vec3 special. - if (VecTy && cast(VecTy)->getNumElements() == 3) { + if (VecTy && !ClangVecTy->isExtVectorBoolType() && + cast(VecTy)->getNumElements() == 3) { // Our source is a vec3, do a shuffle vector to make it a vec4. Value = Builder.CreateShuffleVector(Value, ArrayRef{0, 1, 2, -1}, "extractVec"); @@ -2477,7 +2485,7 @@ void CodeGenFunction::EmitStoreThroughLValue(RValue Src, LValue Dst, void CodeGenFunction::EmitStoreThroughBitfieldLValue(RValue Src, LValue Dst, llvm::Value **Result) { const CGBitFieldInfo &Info = Dst.getBitFieldInfo(); - llvm::Type *ResLTy = ConvertTypeForMem(Dst.getType()); + llvm::Type *ResLTy = convertTypeForLoadStore(Dst.getType()); Address Ptr = Dst.getBitFieldAddress(); // Get the source value, truncated to the width of the bit-field. @@ -3842,9 +3850,10 @@ void CodeGenFunction::EmitTrapCheck(llvm::Value *Checked, llvm::CallInst *TrapCall = Builder.CreateCall( CGM.getIntrinsic(llvm::Intrinsic::ubsantrap), - llvm::ConstantInt::get(CGM.Int8Ty, ClSanitizeDebugDeoptimization - ? TrapBB->getParent()->size() - : CheckHandlerID)); + llvm::ConstantInt::get(CGM.Int8Ty, + ClSanitizeDebugDeoptimization + ? TrapBB->getParent()->size() + : static_cast(CheckHandlerID))); if (!CGM.getCodeGenOpts().TrapFuncName.empty()) { auto A = llvm::Attribute::get(getLLVMContext(), "trap-func-name", @@ -4749,7 +4758,7 @@ static Address emitAddrOfZeroSizeField(CodeGenFunction &CGF, Address Base, /// The resulting address doesn't necessarily have the right type. static Address emitAddrOfFieldStorage(CodeGenFunction &CGF, Address base, const FieldDecl *field) { - if (field->isZeroSize(CGF.getContext())) + if (isEmptyFieldForLayout(CGF.getContext(), field)) return emitAddrOfZeroSizeField(CGF, base, field); const RecordDecl *rec = field->getParent(); diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp index 00a5a7e6898a81..7c65fccb608551 100644 --- a/clang/lib/CodeGen/CGExprConstant.cpp +++ b/clang/lib/CodeGen/CGExprConstant.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "ABIInfoImpl.h" #include "CGCXXABI.h" #include "CGObjCRuntime.h" #include "CGRecordLayout.h" @@ -585,7 +586,7 @@ class ConstStructBuilder { bool AllowOverwrite = false); bool AppendBitField(const FieldDecl *Field, uint64_t FieldOffset, - llvm::ConstantInt *InitExpr, bool AllowOverwrite = false); + llvm::Constant *InitExpr, bool AllowOverwrite = false); bool Build(const InitListExpr *ILE, bool AllowOverwrite); bool Build(const APValue &Val, const RecordDecl *RD, bool IsPrimaryBase, @@ -609,9 +610,25 @@ bool ConstStructBuilder::AppendBytes(CharUnits FieldOffsetInChars, return Builder.add(InitCst, StartOffset + FieldOffsetInChars, AllowOverwrite); } -bool ConstStructBuilder::AppendBitField( - const FieldDecl *Field, uint64_t FieldOffset, llvm::ConstantInt *CI, - bool AllowOverwrite) { +bool ConstStructBuilder::AppendBitField(const FieldDecl *Field, + uint64_t FieldOffset, llvm::Constant *C, + bool AllowOverwrite) { + + llvm::ConstantInt *CI = dyn_cast(C); + if (!CI) { + // Constants for long _BitInt types are sometimes split into individual + // bytes. Try to fold these back into an integer constant. If that doesn't + // work out, then we are trying to initialize a bitfield with a non-trivial + // constant, this must require run-time code. + llvm::Type *LoadType = + CGM.getTypes().convertTypeForLoadStore(Field->getType(), C->getType()); + llvm::Constant *FoldedConstant = llvm::ConstantFoldLoadFromConst( + C, LoadType, llvm::APInt::getZero(32), CGM.getDataLayout()); + CI = dyn_cast_if_present(FoldedConstant); + if (!CI) + return false; + } + const CGRecordLayout &RL = CGM.getTypes().getCGRecordLayout(Field->getParent()); const CGBitFieldInfo &Info = RL.getBitFieldInfo(Field); @@ -720,7 +737,7 @@ bool ConstStructBuilder::Build(const InitListExpr *ILE, bool AllowOverwrite) { // Zero-sized fields are not emitted, but their initializers may still // prevent emission of this struct as a constant. - if (Field->isZeroSize(CGM.getContext())) { + if (isEmptyFieldForLayout(CGM.getContext(), Field)) { if (Init->HasSideEffects(CGM.getContext())) return false; continue; @@ -762,15 +779,9 @@ bool ConstStructBuilder::Build(const InitListExpr *ILE, bool AllowOverwrite) { AllowOverwrite = true; } else { // Otherwise we have a bitfield. - if (auto *CI = dyn_cast(EltInit)) { - if (!AppendBitField(Field, Layout.getFieldOffset(FieldNo), CI, - AllowOverwrite)) - return false; - } else { - // We are trying to initialize a bitfield with a non-trivial constant, - // this must require run-time code. + if (!AppendBitField(Field, Layout.getFieldOffset(FieldNo), EltInit, + AllowOverwrite)) return false; - } } } @@ -848,7 +859,8 @@ bool ConstStructBuilder::Build(const APValue &Val, const RecordDecl *RD, continue; // Don't emit anonymous bitfields or zero-sized fields. - if (Field->isUnnamedBitField() || Field->isZeroSize(CGM.getContext())) + if (Field->isUnnamedBitField() || + isEmptyFieldForLayout(CGM.getContext(), *Field)) continue; // Emit the value of the initializer. @@ -871,7 +883,7 @@ bool ConstStructBuilder::Build(const APValue &Val, const RecordDecl *RD, } else { // Otherwise we have a bitfield. if (!AppendBitField(*Field, Layout.getFieldOffset(FieldNo) + OffsetBits, - cast(EltInit), AllowOverwrite)) + EltInit, AllowOverwrite)) return false; } } @@ -1888,6 +1900,27 @@ llvm::Constant *ConstantEmitter::emitForMemory(CodeGenModule &CGM, return Res; } + if (destType->isBitIntType()) { + ConstantAggregateBuilder Builder(CGM); + llvm::Type *LoadStoreTy = CGM.getTypes().convertTypeForLoadStore(destType); + // ptrtoint/inttoptr should not involve _BitInt in constant expressions, so + // casting to ConstantInt is safe here. + auto *CI = cast(C); + llvm::Constant *Res = llvm::ConstantFoldCastOperand( + destType->isSignedIntegerOrEnumerationType() ? llvm::Instruction::SExt + : llvm::Instruction::ZExt, + CI, LoadStoreTy, CGM.getDataLayout()); + if (CGM.getTypes().typeRequiresSplitIntoByteArray(destType, C->getType())) { + // Long _BitInt has array of bytes as in-memory type. + // So, split constant into individual bytes. + llvm::Type *DesiredTy = CGM.getTypes().ConvertTypeForMem(destType); + llvm::APInt Value = cast(Res)->getValue(); + Builder.addBits(Value, /*OffsetInBits=*/0, /*AllowOverwrite=*/false); + return Builder.build(DesiredTy, /*AllowOversized*/ false); + } + return Res; + } + return C; } @@ -2495,8 +2528,10 @@ static llvm::Constant *EmitNullConstant(CodeGenModule &CGM, cast(I.getType()->castAs()->getDecl()); // Ignore empty bases. - if (base->isEmpty() || - CGM.getContext().getASTRecordLayout(base).getNonVirtualSize() + if (isEmptyRecordForLayout(CGM.getContext(), I.getType()) || + CGM.getContext() + .getASTRecordLayout(base) + .getNonVirtualSize() .isZero()) continue; @@ -2510,7 +2545,8 @@ static llvm::Constant *EmitNullConstant(CodeGenModule &CGM, for (const auto *Field : record->fields()) { // Fill in non-bitfields. (Bitfields always use a zero pattern, which we // will fill in later.) - if (!Field->isBitField() && !Field->isZeroSize(CGM.getContext())) { + if (!Field->isBitField() && + !isEmptyFieldForLayout(CGM.getContext(), Field)) { unsigned fieldIndex = layout.getLLVMFieldNo(Field); elements[fieldIndex] = CGM.EmitNullConstant(Field->getType()); } @@ -2532,7 +2568,7 @@ static llvm::Constant *EmitNullConstant(CodeGenModule &CGM, cast(I.getType()->castAs()->getDecl()); // Ignore empty bases. - if (base->isEmpty()) + if (isEmptyRecordForLayout(CGM.getContext(), I.getType())) continue; unsigned fieldIndex = layout.getVirtualBaseIndex(base); diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index f40f3c273206bb..084dc54537eb7a 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -436,9 +436,10 @@ class ScalarExprEmitter if (Value *Result = ConstantEmitter(CGF).tryEmitConstantExpr(E)) { if (E->isGLValue()) - return CGF.Builder.CreateLoad(Address( - Result, CGF.ConvertTypeForMem(E->getType()), - CGF.getContext().getTypeAlignInChars(E->getType()))); + return CGF.EmitLoadOfScalar( + Address(Result, CGF.convertTypeForLoadStore(E->getType()), + CGF.getContext().getTypeAlignInChars(E->getType())), + /*Volatile*/ false, E->getType(), E->getExprLoc()); return Result; } return Visit(E->getSubExpr()); diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 8bc202f402aa39..652fb700fc6af7 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "CGOpenMPRuntime.h" +#include "ABIInfoImpl.h" #include "CGCXXABI.h" #include "CGCleanup.h" #include "CGRecordLayout.h" @@ -7729,12 +7730,15 @@ class MappableExprsHandler { for (const auto &I : RD->bases()) { if (I.isVirtual()) continue; - const auto *Base = I.getType()->getAsCXXRecordDecl(); + + QualType BaseTy = I.getType(); + const auto *Base = BaseTy->getAsCXXRecordDecl(); // Ignore empty bases. - if (Base->isEmpty() || CGF.getContext() - .getASTRecordLayout(Base) - .getNonVirtualSize() - .isZero()) + if (isEmptyRecordForLayout(CGF.getContext(), BaseTy) || + CGF.getContext() + .getASTRecordLayout(Base) + .getNonVirtualSize() + .isZero()) continue; unsigned FieldIndex = RL.getNonVirtualBaseLLVMFieldNo(Base); @@ -7742,10 +7746,12 @@ class MappableExprsHandler { } // Fill in virtual bases. for (const auto &I : RD->vbases()) { - const auto *Base = I.getType()->getAsCXXRecordDecl(); + QualType BaseTy = I.getType(); // Ignore empty bases. - if (Base->isEmpty()) + if (isEmptyRecordForLayout(CGF.getContext(), BaseTy)) continue; + + const auto *Base = BaseTy->getAsCXXRecordDecl(); unsigned FieldIndex = RL.getVirtualBaseIndex(Base); if (RecordLayout[FieldIndex]) continue; @@ -7756,7 +7762,8 @@ class MappableExprsHandler { for (const auto *Field : RD->fields()) { // Fill in non-bitfields. (Bitfields always use a zero pattern, which we // will fill in later.) - if (!Field->isBitField() && !Field->isZeroSize(CGF.getContext())) { + if (!Field->isBitField() && + !isEmptyFieldForLayout(CGF.getContext(), Field)) { unsigned FieldIndex = RL.getLLVMFieldNo(Field); RecordLayout[FieldIndex] = Field; } diff --git a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp index 5169be204c14d0..ea44e6f21f3c86 100644 --- a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp +++ b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp @@ -10,8 +10,9 @@ // //===----------------------------------------------------------------------===// -#include "CGRecordLayout.h" +#include "ABIInfoImpl.h" #include "CGCXXABI.h" +#include "CGRecordLayout.h" #include "CodeGenTypes.h" #include "clang/AST/ASTContext.h" #include "clang/AST/Attr.h" @@ -384,7 +385,7 @@ void CGRecordLowering::accumulateFields(bool isNonVirtualBaseType) { Field = accumulateBitFields(isNonVirtualBaseType, Field, FieldEnd); assert((Field == FieldEnd || !Field->isBitField()) && "Failed to accumulate all the bitfields"); - } else if (Field->isZeroSize(Context)) { + } else if (isEmptyFieldForLayout(Context, *Field)) { // Empty fields have no storage. ++Field; } else { @@ -427,8 +428,7 @@ CGRecordLowering::accumulateBitFields(bool isNonVirtualBaseType, continue; } uint64_t BitOffset = getFieldBitOffset(*Field); - llvm::Type *Type = - Types.ConvertTypeForMem(Field->getType(), /*ForBitField=*/true); + llvm::Type *Type = Types.ConvertTypeForMem(Field->getType()); // If we don't have a run yet, or don't live within the previous run's // allocated storage then we allocate some storage and start a new run. if (Run == FieldEnd || BitOffset >= Tail) { @@ -634,7 +634,7 @@ CGRecordLowering::accumulateBitFields(bool isNonVirtualBaseType, // non-reusable tail padding. CharUnits LimitOffset; for (auto Probe = Field; Probe != FieldEnd; ++Probe) - if (!Probe->isZeroSize(Context)) { + if (!isEmptyFieldForLayout(Context, *Probe)) { // A member with storage sets the limit. assert((getFieldBitOffset(*Probe) % CharBits) == 0 && "Next storage is not byte-aligned"); @@ -732,7 +732,7 @@ void CGRecordLowering::accumulateBases() { // Bases can be zero-sized even if not technically empty if they // contain only a trailing array member. const CXXRecordDecl *BaseDecl = Base.getType()->getAsCXXRecordDecl(); - if (!BaseDecl->isEmpty() && + if (!isEmptyRecordForLayout(Context, Base.getType()) && !Context.getASTRecordLayout(BaseDecl).getNonVirtualSize().isZero()) Members.push_back(MemberInfo(Layout.getBaseClassOffset(BaseDecl), MemberInfo::Base, getStorageType(BaseDecl), BaseDecl)); @@ -880,7 +880,7 @@ CGRecordLowering::calculateTailClippingOffset(bool isNonVirtualBaseType) const { if (!isNonVirtualBaseType && isOverlappingVBaseABI()) for (const auto &Base : RD->vbases()) { const CXXRecordDecl *BaseDecl = Base.getType()->getAsCXXRecordDecl(); - if (BaseDecl->isEmpty()) + if (isEmptyRecordForLayout(Context, Base.getType())) continue; // If the vbase is a primary virtual base of some base, then it doesn't // get its own storage location but instead lives inside of that base. @@ -896,7 +896,7 @@ CGRecordLowering::calculateTailClippingOffset(bool isNonVirtualBaseType) const { void CGRecordLowering::accumulateVBases() { for (const auto &Base : RD->vbases()) { const CXXRecordDecl *BaseDecl = Base.getType()->getAsCXXRecordDecl(); - if (BaseDecl->isEmpty()) + if (isEmptyRecordForLayout(Context, Base.getType())) continue; CharUnits Offset = Layout.getVBaseClassOffset(BaseDecl); // If the vbase is a primary virtual base of some base, then it doesn't @@ -1162,7 +1162,7 @@ CodeGenTypes::ComputeRecordLayout(const RecordDecl *D, llvm::StructType *Ty) { const FieldDecl *FD = *it; // Ignore zero-sized fields. - if (FD->isZeroSize(getContext())) + if (isEmptyFieldForLayout(getContext(), FD)) continue; // For non-bit-fields, just check that the LLVM struct offset matches the diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 39222c0e65353b..2e65e9fd26099a 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -1537,9 +1537,15 @@ void CodeGenFunction::EmitReturnStmt(const ReturnStmt &S) { Builder.CreateStore(Result.getScalarVal(), ReturnValue); } else { switch (getEvaluationKind(RV->getType())) { - case TEK_Scalar: - Builder.CreateStore(EmitScalarExpr(RV), ReturnValue); + case TEK_Scalar: { + llvm::Value *Ret = EmitScalarExpr(RV); + if (CurFnInfo->getReturnInfo().getKind() == ABIArgInfo::Indirect) + EmitStoreOfScalar(Ret, MakeAddrLValue(ReturnValue, RV->getType()), + /*isInit*/ true); + else + Builder.CreateStore(Ret, ReturnValue); break; + } case TEK_Complex: EmitComplexExprIntoLValue(RV, MakeAddrLValue(ReturnValue, RV->getType()), /*isInit*/ true); diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 26deeca95d326c..ea4635c039cb28 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -233,6 +233,11 @@ llvm::Type *CodeGenFunction::ConvertType(QualType T) { return CGM.getTypes().ConvertType(T); } +llvm::Type *CodeGenFunction::convertTypeForLoadStore(QualType ASTTy, + llvm::Type *LLVMTy) { + return CGM.getTypes().convertTypeForLoadStore(ASTTy, LLVMTy); +} + TypeEvaluationKind CodeGenFunction::getEvaluationKind(QualType type) { type = type.getCanonicalType(); while (true) { diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index e33267c4787fdb..1aac2ee9a5c901 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -2576,6 +2576,8 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Type *ConvertTypeForMem(QualType T); llvm::Type *ConvertType(QualType T); + llvm::Type *convertTypeForLoadStore(QualType ASTTy, + llvm::Type *LLVMTy = nullptr); llvm::Type *ConvertType(const TypeDecl *T) { return ConvertType(getContext().getTypeDeclType(T)); } diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 08cfa694cfb81f..0c002b553e4c60 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -116,6 +116,8 @@ createTargetCodeGenInfo(CodeGenModule &CGM) { default: return createDefaultTargetCodeGenInfo(CGM); + case llvm::Triple::le32: + return createPNaClTargetCodeGenInfo(CGM); case llvm::Triple::m68k: return createM68kTargetCodeGenInfo(CGM); case llvm::Triple::mips: @@ -721,6 +723,11 @@ void CodeGenModule::checkAliases() { cast(Alias)->setAliasee(Aliasee); } } + // ifunc resolvers are usually implemented to run before sanitizer + // initialization. Disable instrumentation to prevent the ordering issue. + if (IsIFunc) + cast(Aliasee)->addFnAttr( + llvm::Attribute::DisableSanitizerInstrumentation); } if (!Error) return; @@ -6106,11 +6113,14 @@ void CodeGenModule::emitIFuncDefinition(GlobalDecl GD) { Aliases.push_back(GD); - llvm::Type *DeclTy = getTypes().ConvertTypeForMem(D->getType()); - llvm::Type *ResolverTy = llvm::GlobalIFunc::getResolverFunctionType(DeclTy); + // The resolver might not be visited yet. Specify a dummy non-function type to + // indicate IsIncompleteFunction. Either the type is ignored (if the resolver + // was emitted) or the whole function will be replaced (if the resolver has + // not been emitted). llvm::Constant *Resolver = - GetOrCreateLLVMFunction(IFA->getResolver(), ResolverTy, {}, + GetOrCreateLLVMFunction(IFA->getResolver(), VoidTy, {}, /*ForVTable=*/false); + llvm::Type *DeclTy = getTypes().ConvertTypeForMem(D->getType()); llvm::GlobalIFunc *GIF = llvm::GlobalIFunc::create(DeclTy, 0, llvm::Function::ExternalLinkage, "", Resolver, &getModule()); @@ -6134,9 +6144,6 @@ void CodeGenModule::emitIFuncDefinition(GlobalDecl GD) { Entry->eraseFromParent(); } else GIF->setName(MangledName); - if (auto *F = dyn_cast(Resolver)) { - F->addFnAttr(llvm::Attribute::DisableSanitizerInstrumentation); - } SetCommonAttributes(GD, GIF); } diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp index 284421f494711e..b889d1bfe003f3 100644 --- a/clang/lib/CodeGen/CodeGenTBAA.cpp +++ b/clang/lib/CodeGen/CodeGenTBAA.cpp @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "CodeGenTBAA.h" +#include "ABIInfoImpl.h" #include "CGRecordLayout.h" #include "CodeGenTypes.h" #include "clang/AST/ASTContext.h" @@ -309,7 +310,7 @@ CodeGenTBAA::CollectFields(uint64_t BaseOffset, unsigned idx = 0; for (RecordDecl::field_iterator i = RD->field_begin(), e = RD->field_end(); i != e; ++i, ++idx) { - if ((*i)->isZeroSize(Context)) + if (isEmptyFieldForLayout(Context, *i)) continue; uint64_t Offset = diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index d823c336e39bf0..e0f567c5da342e 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -89,7 +89,14 @@ void CodeGenTypes::addRecordTypeName(const RecordDecl *RD, /// ConvertType in that it is used to convert to the memory representation for /// a type. For example, the scalar representation for _Bool is i1, but the /// memory representation is usually i8 or i32, depending on the target. -llvm::Type *CodeGenTypes::ConvertTypeForMem(QualType T, bool ForBitField) { +/// +/// We generally assume that the alloc size of this type under the LLVM +/// data layout is the same as the size of the AST type. The alignment +/// does not have to match: Clang should always use explicit alignments +/// and packed structs as necessary to produce the layout it needs. +/// But the size does need to be exactly right or else things like struct +/// layout will break. +llvm::Type *CodeGenTypes::ConvertTypeForMem(QualType T) { if (T->isConstantMatrixType()) { const Type *Ty = Context.getCanonicalType(T).getTypePtr(); const ConstantMatrixType *MT = cast(Ty); @@ -107,10 +114,28 @@ llvm::Type *CodeGenTypes::ConvertTypeForMem(QualType T, bool ForBitField) { return llvm::IntegerType::get(FixedVT->getContext(), BytePadded); } - // If this is a bool type, or a bit-precise integer type in a bitfield - // representation, map this integer to the target-specified size. - if ((ForBitField && T->isBitIntType()) || - (!T->isBitIntType() && R->isIntegerTy(1))) + // If T is _Bool or a _BitInt type, ConvertType will produce an IR type + // with the exact semantic bit-width of the AST type; for example, + // _BitInt(17) will turn into i17. In memory, however, we need to store + // such values extended to their full storage size as decided by AST + // layout; this is an ABI requirement. Ideally, we would always use an + // integer type that's just the bit-size of the AST type; for example, if + // sizeof(_BitInt(17)) == 4, _BitInt(17) would turn into i32. That is what's + // returned by convertTypeForLoadStore. However, that type does not + // always satisfy the size requirement on memory representation types + // describe above. For example, a 32-bit platform might reasonably set + // sizeof(_BitInt(65)) == 12, but i96 is likely to have to have an alloc size + // of 16 bytes in the LLVM data layout. In these cases, we simply return + // a byte array of the appropriate size. + if (T->isBitIntType()) { + if (typeRequiresSplitIntoByteArray(T, R)) + return llvm::ArrayType::get(CGM.Int8Ty, + Context.getTypeSizeInChars(T).getQuantity()); + return llvm::IntegerType::get(getLLVMContext(), + (unsigned)Context.getTypeSize(T)); + } + + if (R->isIntegerTy(1)) return llvm::IntegerType::get(getLLVMContext(), (unsigned)Context.getTypeSize(T)); @@ -118,6 +143,36 @@ llvm::Type *CodeGenTypes::ConvertTypeForMem(QualType T, bool ForBitField) { return R; } +bool CodeGenTypes::typeRequiresSplitIntoByteArray(QualType ASTTy, + llvm::Type *LLVMTy) { + if (!LLVMTy) + LLVMTy = ConvertType(ASTTy); + + CharUnits ASTSize = Context.getTypeSizeInChars(ASTTy); + CharUnits LLVMSize = + CharUnits::fromQuantity(getDataLayout().getTypeAllocSize(LLVMTy)); + return ASTSize != LLVMSize; +} + +llvm::Type *CodeGenTypes::convertTypeForLoadStore(QualType T, + llvm::Type *LLVMTy) { + if (!LLVMTy) + LLVMTy = ConvertType(T); + + if (T->isBitIntType()) + return llvm::Type::getIntNTy( + getLLVMContext(), Context.getTypeSizeInChars(T).getQuantity() * 8); + + if (LLVMTy->isIntegerTy(1)) + return llvm::IntegerType::get(getLLVMContext(), + (unsigned)Context.getTypeSize(T)); + + if (T->isExtVectorBoolType()) + return ConvertTypeForMem(T); + + return LLVMTy; +} + /// isRecordLayoutComplete - Return true if the specified type is already /// completely laid out. bool CodeGenTypes::isRecordLayoutComplete(const Type *Ty) const { diff --git a/clang/lib/CodeGen/CodeGenTypes.h b/clang/lib/CodeGen/CodeGenTypes.h index 01c0c673795c0f..cbda2628e9140f 100644 --- a/clang/lib/CodeGen/CodeGenTypes.h +++ b/clang/lib/CodeGen/CodeGenTypes.h @@ -126,7 +126,30 @@ class CodeGenTypes { /// ConvertType in that it is used to convert to the memory representation for /// a type. For example, the scalar representation for _Bool is i1, but the /// memory representation is usually i8 or i32, depending on the target. - llvm::Type *ConvertTypeForMem(QualType T, bool ForBitField = false); + llvm::Type *ConvertTypeForMem(QualType T); + + /// Check whether the given type needs to be laid out in memory + /// using an opaque byte-array type because its load/store type + /// does not have the correct alloc size in the LLVM data layout. + /// If this is false, the load/store type (convertTypeForLoadStore) + /// and memory representation type (ConvertTypeForMem) will + /// be the same type. + bool typeRequiresSplitIntoByteArray(QualType ASTTy, + llvm::Type *LLVMTy = nullptr); + + /// Given that T is a scalar type, return the IR type that should + /// be used for load and store operations. For example, this might + /// be i8 for _Bool or i96 for _BitInt(65). The store size of the + /// load/store type (as reported by LLVM's data layout) is always + /// the same as the alloc size of the memory representation type + /// returned by ConvertTypeForMem. + /// + /// As an optimization, if you already know the scalar value type + /// for T (as would be returned by ConvertType), you can pass + /// it as the second argument so that it does not need to be + /// recomputed in common cases where the value type and + /// load/store type are the same. + llvm::Type *convertTypeForLoadStore(QualType T, llvm::Type *LLVMTy = nullptr); /// GetFunctionType - Get the LLVM function type for \arg Info. llvm::FunctionType *GetFunctionType(const CGFunctionInfo &Info); diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp index c823820747de77..489051e95953e8 100644 --- a/clang/lib/CodeGen/ItaniumCXXABI.cpp +++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp @@ -576,6 +576,13 @@ CodeGen::CGCXXABI *CodeGen::CreateItaniumCXXABI(CodeGenModule &CGM) { return new XLCXXABI(CGM); case TargetCXXABI::GenericItanium: + if (CGM.getContext().getTargetInfo().getTriple().getArch() + == llvm::Triple::le32) { + // For PNaCl, use ARM-style method pointers so that PNaCl code + // does not assume anything about the alignment of function + // pointers. + return new ItaniumCXXABI(CGM, /*UseARMMethodPtrABI=*/true); + } return new ItaniumCXXABI(CGM); case TargetCXXABI::Microsoft: diff --git a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp index 1831a4113fcd08..149a31f58e75d2 100644 --- a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp +++ b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp @@ -77,7 +77,8 @@ void riscv::getRISCVTargetFeatures(const Driver &D, const llvm::Triple &Triple, if (!getArchFeatures(D, MArch, Features, Args)) return; - bool CPUFastUnaligned = false; + bool CPUFastScalarUnaligned = false; + bool CPUFastVectorUnaligned = false; // If users give march and mcpu, get std extension feature from MArch // and other features (ex. mirco architecture feature) from mcpu @@ -88,8 +89,10 @@ void riscv::getRISCVTargetFeatures(const Driver &D, const llvm::Triple &Triple, getRISCFeaturesFromMcpu(D, A, Triple, CPU, Features); - if (llvm::RISCV::hasFastUnalignedAccess(CPU)) - CPUFastUnaligned = true; + if (llvm::RISCV::hasFastScalarUnalignedAccess(CPU)) + CPUFastScalarUnaligned = true; + if (llvm::RISCV::hasFastVectorUnalignedAccess(CPU)) + CPUFastVectorUnaligned = true; } // Handle features corresponding to "-ffixed-X" options @@ -169,20 +172,37 @@ void riscv::getRISCVTargetFeatures(const Driver &D, const llvm::Triple &Triple, Features.push_back("-relax"); } - // If -mstrict-align or -mno-strict-align is passed, use it. Otherwise, the - // unaligned-*-mem is enabled if the CPU supports it or the target is + // If -mstrict-align, -mno-strict-align, -mscalar-strict-align, or + // -mno-scalar-strict-align is passed, use it. Otherwise, the + // unaligned-scalar-mem is enabled if the CPU supports it or the target is // Android. - if (const Arg *A = Args.getLastArg(options::OPT_mno_strict_align, - options::OPT_mstrict_align)) { - if (A->getOption().matches(options::OPT_mno_strict_align)) { + if (const Arg *A = Args.getLastArg( + options::OPT_mno_strict_align, options::OPT_mscalar_strict_align, + options::OPT_mstrict_align, options::OPT_mno_scalar_strict_align)) { + if (A->getOption().matches(options::OPT_mno_strict_align) || + A->getOption().matches(options::OPT_mno_scalar_strict_align)) { Features.push_back("+unaligned-scalar-mem"); - Features.push_back("+unaligned-vector-mem"); } else { Features.push_back("-unaligned-scalar-mem"); - Features.push_back("-unaligned-vector-mem"); } - } else if (CPUFastUnaligned || Triple.isAndroid()) { + } else if (CPUFastScalarUnaligned || Triple.isAndroid()) { Features.push_back("+unaligned-scalar-mem"); + } + + // If -mstrict-align, -mno-strict-align, -mvector-strict-align, or + // -mno-vector-strict-align is passed, use it. Otherwise, the + // unaligned-vector-mem is enabled if the CPU supports it or the target is + // Android. + if (const Arg *A = Args.getLastArg( + options::OPT_mno_strict_align, options::OPT_mvector_strict_align, + options::OPT_mstrict_align, options::OPT_mno_vector_strict_align)) { + if (A->getOption().matches(options::OPT_mno_strict_align) || + A->getOption().matches(options::OPT_mno_vector_strict_align)) { + Features.push_back("+unaligned-vector-mem"); + } else { + Features.push_back("-unaligned-vector-mem"); + } + } else if (CPUFastVectorUnaligned || Triple.isAndroid()) { Features.push_back("+unaligned-vector-mem"); } @@ -290,8 +310,24 @@ std::string riscv::getRISCVArch(const llvm::opt::ArgList &Args, // 2. Get march (isa string) based on `-mcpu=` if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { StringRef CPU = A->getValue(); - if (CPU == "native") + if (CPU == "native") { CPU = llvm::sys::getHostCPUName(); + // If the target cpu is unrecognized, use target features. + if (CPU.starts_with("generic")) { + auto FeatureMap = llvm::sys::getHostCPUFeatures(); + // hwprobe may be unavailable on older Linux versions. + if (!FeatureMap.empty()) { + std::vector Features; + for (auto &F : FeatureMap) + Features.push_back(((F.second ? "+" : "-") + F.first()).str()); + auto ParseResult = llvm::RISCVISAInfo::parseFeatures( + Triple.isRISCV32() ? 32 : 64, Features); + if (ParseResult) + return (*ParseResult)->toString(); + } + } + } + StringRef MArch = llvm::RISCV::getMArchFromMcpu(CPU); // Bypass if target cpu's default march is empty. if (MArch != "") diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp index 9fca7864b2546c..2f63333b732f61 100644 --- a/clang/lib/Driver/ToolChains/Arch/X86.cpp +++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp @@ -310,4 +310,17 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, Features.push_back("+prefer-no-scatter"); if (Args.hasArg(options::OPT_mapx_inline_asm_use_gpr32)) Features.push_back("+inline-asm-use-gpr32"); + + // Warn for removed 3dnow support + if (const Arg *A = + Args.getLastArg(options::OPT_m3dnowa, options::OPT_mno_3dnowa, + options::OPT_mno_3dnow)) { + if (A->getOption().matches(options::OPT_m3dnowa)) + D.Diag(diag::warn_drv_clang_unsupported) << A->getAsString(Args); + } + if (const Arg *A = + Args.getLastArg(options::OPT_m3dnow, options::OPT_mno_3dnow)) { + if (A->getOption().matches(options::OPT_m3dnow)) + D.Diag(diag::warn_drv_clang_unsupported) << A->getAsString(Args); + } } diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index bc21d03a627b9c..a8a7cef09972e7 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2545,6 +2545,13 @@ static void CollectArgsForIntegratedAssembler(Compilation &C, switch (C.getDefaultToolChain().getArch()) { default: break; + case llvm::Triple::x86: + case llvm::Triple::x86_64: + if (Value == "-msse2avx") { + CmdArgs.push_back("-msse2avx"); + continue; + } + break; case llvm::Triple::wasm32: case llvm::Triple::wasm64: if (Value == "--no-type-check") { @@ -3815,6 +3822,12 @@ static void RenderBuiltinOptions(const ToolChain &TC, const llvm::Triple &T, if (UseBuiltins) A->render(Args, CmdArgs); } + + // le32-specific flags: + // -fno-math-builtin: clang should not convert math builtins to intrinsics + // by default. + if (TC.getArch() == llvm::Triple::le32) + CmdArgs.push_back("-fno-math-builtin"); } bool Driver::getDefaultModuleCachePath(SmallVectorImpl &Result) { @@ -5819,7 +5832,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, } // If toolchain choose to use MCAsmParser for inline asm don't pass the - // option to disable integrated-as explictly. + // option to disable integrated-as explicitly. if (!TC.useIntegratedAs() && !TC.parseInlineAsmUsingAsmParser()) CmdArgs.push_back("-no-integrated-as"); diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 939d89bb1e2fec..c4f2375c64034b 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -15,6 +15,7 @@ #include "llvm/Frontend/Debug/Options.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" +#include "llvm/TargetParser/Host.h" #include "llvm/TargetParser/RISCVISAInfo.h" #include "llvm/TargetParser/RISCVTargetParser.h" @@ -419,6 +420,13 @@ void Flang::addTargetOptions(const ArgList &Args, } // TODO: Add target specific flags, ABI, mtune option etc. + if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) { + CmdArgs.push_back("-tune-cpu"); + if (A->getValue() == StringRef{"native"}) + CmdArgs.push_back(Args.MakeArgString(llvm::sys::getHostCPUName())); + else + CmdArgs.push_back(A->getValue()); + } } void Flang::addOffloadOptions(Compilation &C, const InputInfoList &Inputs, @@ -815,7 +823,7 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA, case CodeGenOptions::FramePointerKind::None: FPKeepKindStr = "-mframe-pointer=none"; break; - case CodeGenOptions::FramePointerKind::Reserved: + case CodeGenOptions::FramePointerKind::Reserved: FPKeepKindStr = "-mframe-pointer=reserved"; break; case CodeGenOptions::FramePointerKind::NonLeaf: diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp index 3fd62d97930937..974e486a0082bc 100644 --- a/clang/lib/Driver/ToolChains/PS4CPU.cpp +++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp @@ -118,11 +118,11 @@ void toolchains::PS5CPU::addSanitizerArgs(const ArgList &Args, CmdArgs.push_back(arg("SceThreadSanitizer_nosubmission_stub_weak")); } -void tools::PScpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, - const InputInfo &Output, - const InputInfoList &Inputs, - const ArgList &Args, - const char *LinkingOutput) const { +void tools::PS4cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const ArgList &Args, + const char *LinkingOutput) const { auto &TC = static_cast(getToolChain()); const Driver &D = TC.getDriver(); ArgStringList CmdArgs; @@ -155,14 +155,120 @@ void tools::PScpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, const bool UseLTO = D.isUsingLTO(); const bool UseJMC = Args.hasFlag(options::OPT_fjmc, options::OPT_fno_jmc, false); - const bool IsPS4 = TC.getTriple().isPS4(); - const char *PS4LTOArgs = ""; + const char *LTOArgs = ""; auto AddCodeGenFlag = [&](Twine Flag) { - if (IsPS4) - PS4LTOArgs = Args.MakeArgString(Twine(PS4LTOArgs) + " " + Flag); + LTOArgs = Args.MakeArgString(Twine(LTOArgs) + " " + Flag); + }; + + if (UseLTO) { + // We default to creating the arange section, but LTO does not. Enable it + // here. + AddCodeGenFlag("-generate-arange-section"); + + // This tells LTO to perform JustMyCode instrumentation. + if (UseJMC) + AddCodeGenFlag("-enable-jmc-instrument"); + + if (Arg *A = Args.getLastArg(options::OPT_fcrash_diagnostics_dir)) + AddCodeGenFlag(Twine("-crash-diagnostics-dir=") + A->getValue()); + + StringRef Parallelism = getLTOParallelism(Args, D); + if (!Parallelism.empty()) + AddCodeGenFlag(Twine("-threads=") + Parallelism); + + const char *Prefix = nullptr; + if (D.getLTOMode() == LTOK_Thin) + Prefix = "-lto-thin-debug-options="; + else if (D.getLTOMode() == LTOK_Full) + Prefix = "-lto-debug-options="; else - CmdArgs.push_back(Args.MakeArgString(Twine("-plugin-opt=") + Flag)); + llvm_unreachable("new LTO mode?"); + + CmdArgs.push_back(Args.MakeArgString(Twine(Prefix) + LTOArgs)); + } + + if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) + TC.addSanitizerArgs(Args, CmdArgs, "-l", ""); + + if (D.isUsingLTO() && Args.hasArg(options::OPT_funified_lto)) { + if (D.getLTOMode() == LTOK_Thin) + CmdArgs.push_back("--lto=thin"); + else if (D.getLTOMode() == LTOK_Full) + CmdArgs.push_back("--lto=full"); + } + + Args.addAllArgs(CmdArgs, {options::OPT_L, options::OPT_T_Group, + options::OPT_s, options::OPT_t}); + + if (Args.hasArg(options::OPT_Z_Xlinker__no_demangle)) + CmdArgs.push_back("--no-demangle"); + + AddLinkerInputs(TC, Inputs, Args, CmdArgs, JA); + + if (Args.hasArg(options::OPT_pthread)) { + CmdArgs.push_back("-lpthread"); + } + + if (UseJMC) { + CmdArgs.push_back("--whole-archive"); + CmdArgs.push_back("-lSceDbgJmc"); + CmdArgs.push_back("--no-whole-archive"); + } + + if (Args.hasArg(options::OPT_fuse_ld_EQ)) { + D.Diag(diag::err_drv_unsupported_opt_for_target) + << "-fuse-ld" << TC.getTriple().str(); + } + + std::string LdName = TC.qualifyPSCmdName(TC.getLinkerBaseName()); + const char *Exec = Args.MakeArgString(TC.GetProgramPath(LdName.c_str())); + + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileUTF8(), + Exec, CmdArgs, Inputs, Output)); +} + +void tools::PS5cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const ArgList &Args, + const char *LinkingOutput) const { + auto &TC = static_cast(getToolChain()); + const Driver &D = TC.getDriver(); + ArgStringList CmdArgs; + + // Silence warning for "clang -g foo.o -o foo" + Args.ClaimAllArgs(options::OPT_g_Group); + // and "clang -emit-llvm foo.o -o foo" + Args.ClaimAllArgs(options::OPT_emit_llvm); + // and for "clang -w foo.o -o foo". Other warning options are already + // handled somewhere else. + Args.ClaimAllArgs(options::OPT_w); + + if (!D.SysRoot.empty()) + CmdArgs.push_back(Args.MakeArgString("--sysroot=" + D.SysRoot)); + + if (Args.hasArg(options::OPT_pie)) + CmdArgs.push_back("-pie"); + + if (Args.hasArg(options::OPT_rdynamic)) + CmdArgs.push_back("-export-dynamic"); + if (Args.hasArg(options::OPT_shared)) + CmdArgs.push_back("--shared"); + + assert((Output.isFilename() || Output.isNothing()) && "Invalid output."); + if (Output.isFilename()) { + CmdArgs.push_back("-o"); + CmdArgs.push_back(Output.getFilename()); + } + + const bool UseLTO = D.isUsingLTO(); + const bool UseJMC = + Args.hasFlag(options::OPT_fjmc, options::OPT_fno_jmc, false); + + auto AddCodeGenFlag = [&](Twine Flag) { + CmdArgs.push_back(Args.MakeArgString(Twine("-plugin-opt=") + Flag)); }; if (UseLTO) { @@ -178,24 +284,8 @@ void tools::PScpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, AddCodeGenFlag(Twine("-crash-diagnostics-dir=") + A->getValue()); StringRef Parallelism = getLTOParallelism(Args, D); - if (!Parallelism.empty()) { - if (IsPS4) - AddCodeGenFlag(Twine("-threads=") + Parallelism); - else - CmdArgs.push_back(Args.MakeArgString(Twine("-plugin-opt=jobs=") + Parallelism)); - } - - if (IsPS4) { - const char *Prefix = nullptr; - if (D.getLTOMode() == LTOK_Thin) - Prefix = "-lto-thin-debug-options="; - else if (D.getLTOMode() == LTOK_Full) - Prefix = "-lto-debug-options="; - else - llvm_unreachable("new LTO mode?"); - - CmdArgs.push_back(Args.MakeArgString(Twine(Prefix) + PS4LTOArgs)); - } + if (!Parallelism.empty()) + CmdArgs.push_back(Args.MakeArgString(Twine("-plugin-opt=jobs=") + Parallelism)); } if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) @@ -222,10 +312,7 @@ void tools::PScpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (UseJMC) { CmdArgs.push_back("--whole-archive"); - if (IsPS4) - CmdArgs.push_back("-lSceDbgJmc"); - else - CmdArgs.push_back("-lSceJmc_nosubmission"); + CmdArgs.push_back("-lSceJmc_nosubmission"); CmdArgs.push_back("--no-whole-archive"); } @@ -321,14 +408,18 @@ Tool *toolchains::PS4CPU::buildAssembler() const { return new tools::PScpu::Assembler(*this); } +Tool *toolchains::PS4CPU::buildLinker() const { + return new tools::PS4cpu::Linker(*this); +} + Tool *toolchains::PS5CPU::buildAssembler() const { // PS5 does not support an external assembler. getDriver().Diag(clang::diag::err_no_external_assembler); return nullptr; } -Tool *toolchains::PS4PS5Base::buildLinker() const { - return new tools::PScpu::Linker(*this); +Tool *toolchains::PS5CPU::buildLinker() const { + return new tools::PS5cpu::Linker(*this); } SanitizerMask toolchains::PS4PS5Base::getSupportedSanitizers() const { diff --git a/clang/lib/Driver/ToolChains/PS4CPU.h b/clang/lib/Driver/ToolChains/PS4CPU.h index fee80e77462f39..0be90183c637c8 100644 --- a/clang/lib/Driver/ToolChains/PS4CPU.h +++ b/clang/lib/Driver/ToolChains/PS4CPU.h @@ -38,10 +38,12 @@ class LLVM_LIBRARY_VISIBILITY Assembler final : public Tool { const llvm::opt::ArgList &TCArgs, const char *LinkingOutput) const override; }; +} // namespace PScpu +namespace PS4cpu { class LLVM_LIBRARY_VISIBILITY Linker final : public Tool { public: - Linker(const ToolChain &TC) : Tool("PScpu::Linker", "linker", TC) {} + Linker(const ToolChain &TC) : Tool("PS4cpu::Linker", "linker", TC) {} bool hasIntegratedCPP() const override { return false; } bool isLinkJob() const override { return true; } @@ -51,7 +53,23 @@ class LLVM_LIBRARY_VISIBILITY Linker final : public Tool { const llvm::opt::ArgList &TCArgs, const char *LinkingOutput) const override; }; -} // namespace PScpu +} // namespace PS4cpu + +namespace PS5cpu { +class LLVM_LIBRARY_VISIBILITY Linker final : public Tool { +public: + Linker(const ToolChain &TC) : Tool("PS5cpu::Linker", "linker", TC) {} + + bool hasIntegratedCPP() const override { return false; } + bool isLinkJob() const override { return true; } + + void ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, const InputInfoList &Inputs, + const llvm::opt::ArgList &TCArgs, + const char *LinkingOutput) const override; +}; +} // namespace PS5cpu + } // namespace tools namespace toolchains { @@ -110,9 +128,6 @@ class LLVM_LIBRARY_VISIBILITY PS4PS5Base : public Generic_ELF { const char *Suffix) const = 0; virtual const char *getProfileRTLibName() const = 0; -protected: - Tool *buildLinker() const override; - private: // We compute the SDK root dir in the ctor, and use it later. std::string SDKRootDir; @@ -143,6 +158,7 @@ class LLVM_LIBRARY_VISIBILITY PS4CPU : public PS4PS5Base { protected: Tool *buildAssembler() const override; + Tool *buildLinker() const override; }; // PS5-specific Toolchain class. @@ -168,6 +184,7 @@ class LLVM_LIBRARY_VISIBILITY PS5CPU : public PS4PS5Base { protected: Tool *buildAssembler() const override; + Tool *buildLinker() const override; }; } // end namespace toolchains diff --git a/clang/lib/ExtractAPI/DeclarationFragments.cpp b/clang/lib/ExtractAPI/DeclarationFragments.cpp index 8c7c0f8a147260..6b85c7db90349e 100644 --- a/clang/lib/ExtractAPI/DeclarationFragments.cpp +++ b/clang/lib/ExtractAPI/DeclarationFragments.cpp @@ -710,7 +710,8 @@ DeclarationFragmentsBuilder::getFragmentsForFunction(const FunctionDecl *Func) { Fragments.append(std::move(ReturnValueFragment)) .appendSpace() - .append(Func->getName(), DeclarationFragments::FragmentKind::Identifier); + .append(Func->getNameAsString(), + DeclarationFragments::FragmentKind::Identifier); if (Func->getTemplateSpecializationInfo()) { Fragments.append("<", DeclarationFragments::FragmentKind::Text); @@ -1610,9 +1611,12 @@ DeclarationFragmentsBuilder::getSubHeading(const NamedDecl *Decl) { cast(Decl)->isOverloadedOperator()) { Fragments.append(Decl->getNameAsString(), DeclarationFragments::FragmentKind::Identifier); - } else if (!Decl->getName().empty()) + } else if (Decl->getIdentifier()) { Fragments.append(Decl->getName(), DeclarationFragments::FragmentKind::Identifier); + } else + Fragments.append(Decl->getDeclName().getAsString(), + DeclarationFragments::FragmentKind::Identifier); return Fragments; } diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 1fd309afd697ef..b6d6e52ccb8f89 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -4713,14 +4713,13 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, if (Right.is(TT_OverloadedOperatorLParen)) return spaceRequiredBeforeParens(Right); // Function declaration or definition - if (Line.MightBeFunctionDecl && (Left.is(TT_FunctionDeclarationName))) { - if (Line.mightBeFunctionDefinition()) { - return Style.SpaceBeforeParensOptions.AfterFunctionDefinitionName || - spaceRequiredBeforeParens(Right); - } else { - return Style.SpaceBeforeParensOptions.AfterFunctionDeclarationName || - spaceRequiredBeforeParens(Right); - } + if (Line.MightBeFunctionDecl && Right.is(TT_FunctionDeclarationLParen)) { + if (spaceRequiredBeforeParens(Right)) + return true; + const auto &Options = Style.SpaceBeforeParensOptions; + return Line.mightBeFunctionDefinition() + ? Options.AfterFunctionDefinitionName + : Options.AfterFunctionDeclarationName; } // Lambda if (Line.Type != LT_PreprocessorDirective && Left.is(tok::r_square) && diff --git a/clang/lib/Headers/mm3dnow.h b/clang/lib/Headers/mm3dnow.h index 22ab13aa334098..afffba3a9c75eb 100644 --- a/clang/lib/Headers/mm3dnow.h +++ b/clang/lib/Headers/mm3dnow.h @@ -7,151 +7,16 @@ *===-----------------------------------------------------------------------=== */ +// 3dNow intrinsics are no longer supported. + #ifndef _MM3DNOW_H_INCLUDED #define _MM3DNOW_H_INCLUDED +#ifndef _CLANG_DISABLE_CRT_DEPRECATION_WARNINGS +#warning "The header is deprecated, and 3dNow! intrinsics are unsupported. For other intrinsics, include , instead." +#endif + #include #include -typedef float __v2sf __attribute__((__vector_size__(8))); - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow"), __min_vector_width__(64))) - -static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("3dnow"))) -_m_femms(void) { - __builtin_ia32_femms(); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pavgusb(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pavgusb((__v8qi)__m1, (__v8qi)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pf2id(__m64 __m) { - return (__m64)__builtin_ia32_pf2id((__v2sf)__m); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfacc(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfacc((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfadd(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfadd((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfcmpeq(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfcmpge(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfcmpge((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfcmpgt(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfmax(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfmax((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfmin(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfmin((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfmul(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfmul((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfrcp(__m64 __m) { - return (__m64)__builtin_ia32_pfrcp((__v2sf)__m); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfrcpit1(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfrcpit2(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfrsqrt(__m64 __m) { - return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__m); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfrsqrtit1(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfsub(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfsub((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfsubr(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfsubr((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pi2fd(__m64 __m) { - return (__m64)__builtin_ia32_pi2fd((__v2si)__m); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pmulhrw(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pmulhrw((__v4hi)__m1, (__v4hi)__m2); -} - -/* Handle the 3dnowa instructions here. */ -#undef __DEFAULT_FN_ATTRS -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa"), __min_vector_width__(64))) - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pf2iw(__m64 __m) { - return (__m64)__builtin_ia32_pf2iw((__v2sf)__m); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfnacc(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfnacc((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfpnacc(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfpnacc((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pi2fw(__m64 __m) { - return (__m64)__builtin_ia32_pi2fw((__v2si)__m); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pswapdsf(__m64 __m) { - return (__m64)__builtin_ia32_pswapdsf((__v2sf)__m); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pswapdsi(__m64 __m) { - return (__m64)__builtin_ia32_pswapdsi((__v2si)__m); -} - -#undef __DEFAULT_FN_ATTRS - #endif diff --git a/clang/lib/Headers/x86intrin.h b/clang/lib/Headers/x86intrin.h index c20bfbb8fe46e2..f42e9e580f883a 100644 --- a/clang/lib/Headers/x86intrin.h +++ b/clang/lib/Headers/x86intrin.h @@ -14,10 +14,6 @@ #include -#if !defined(__SCE__) || __has_feature(modules) || defined(__3dNOW__) -#include -#endif - #if !defined(__SCE__) || __has_feature(modules) || defined(__PRFCHW__) #include #endif diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index 310a38b2cd7866..3d7c58e5b3c3cd 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -1099,6 +1099,7 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, // primary-expression case tok::numeric_constant: + case tok::binary_data: // constant: integer-constant // constant: floating-constant @@ -1148,18 +1149,9 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, } case tok::annot_embed: { - // We've met #embed in a context where a single value is expected. Take last - // element from #embed data as if it were a comma expression. - EmbedAnnotationData *Data = - reinterpret_cast(Tok.getAnnotationValue()); - SourceLocation StartLoc = ConsumeAnnotationToken(); - ASTContext &Context = Actions.getASTContext(); - Res = IntegerLiteral::Create(Context, - llvm::APInt(CHAR_BIT, Data->BinaryData.back()), - Context.UnsignedCharTy, StartLoc); - if (Data->BinaryData.size() > 1) - Diag(StartLoc, diag::warn_unused_comma_left_operand); - break; + injectEmbedTokens(); + return ParseCastExpression(ParseKind, isAddressOfOperand, isTypeCast, + isVectorLiteral, NotPrimaryExpression); } case tok::kw___super: @@ -2348,9 +2340,10 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) { } if (!LHS.isInvalid()) - LHS = Actions.ActOnMemberAccessExpr( - getCurScope(), LHS.get(), OpLoc, OpKind, SS, TemplateKWLoc, Name, - CurParsedObjCImpl ? CurParsedObjCImpl->Dcl : nullptr); + LHS = Actions.ActOnMemberAccessExpr(getCurScope(), LHS.get(), OpLoc, + OpKind, SS, TemplateKWLoc, Name, + CurParsedObjCImpl ? CurParsedObjCImpl->Dcl + : nullptr); if (!LHS.isInvalid()) { if (Tok.is(tok::less)) checkPotentialAngleBracket(LHS); @@ -3583,15 +3576,29 @@ ExprResult Parser::ParseFoldExpression(ExprResult LHS, T.getCloseLocation()); } -void Parser::ExpandEmbedDirective(SmallVectorImpl &Exprs) { +void Parser::injectEmbedTokens() { EmbedAnnotationData *Data = reinterpret_cast(Tok.getAnnotationValue()); - SourceLocation StartLoc = ConsumeAnnotationToken(); - ASTContext &Context = Actions.getASTContext(); - for (auto Byte : Data->BinaryData) { - Exprs.push_back(IntegerLiteral::Create(Context, llvm::APInt(CHAR_BIT, Byte), - Context.UnsignedCharTy, StartLoc)); - } + MutableArrayRef Toks(PP.getPreprocessorAllocator().Allocate( + Data->BinaryData.size() * 2 - 1), + Data->BinaryData.size() * 2 - 1); + unsigned I = 0; + for (auto &Byte : Data->BinaryData) { + Toks[I].startToken(); + Toks[I].setKind(tok::binary_data); + Toks[I].setLocation(Tok.getLocation()); + Toks[I].setLength(1); + Toks[I].setLiteralData(&Byte); + if (I != ((Data->BinaryData.size() - 1) * 2)) { + Toks[I + 1].startToken(); + Toks[I + 1].setKind(tok::comma); + Toks[I + 1].setLocation(Tok.getLocation()); + } + I += 2; + } + PP.EnterTokenStream(std::move(Toks), /*DisableMacroExpansion=*/true, + /*IsReinject=*/false); + ConsumeAnyToken(/*ConsumeCodeCompletionTok=*/true); } /// ParseExpressionList - Used for C/C++ (argument-)expression-list. @@ -3629,17 +3636,8 @@ bool Parser::ParseExpressionList(SmallVectorImpl &Exprs, if (getLangOpts().CPlusPlus11 && Tok.is(tok::l_brace)) { Diag(Tok, diag::warn_cxx98_compat_generalized_initializer_lists); Expr = ParseBraceInitializer(); - } else if (Tok.is(tok::annot_embed)) { - ExpandEmbedDirective(Exprs); - if (Tok.isNot(tok::comma)) - break; - Token Comma = Tok; - ConsumeToken(); - checkPotentialAngleBracketDelimiter(Comma); - continue; - } else { + } else Expr = ParseAssignmentExpression(); - } if (EarlyTypoCorrection) Expr = Actions.CorrectDelayedTyposInExpr(Expr); diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp index 17be090dea3fd1..1d364f77a81464 100644 --- a/clang/lib/Parse/ParseExprCXX.cpp +++ b/clang/lib/Parse/ParseExprCXX.cpp @@ -100,8 +100,7 @@ void Parser::CheckForTemplateAndDigraph(Token &Next, ParsedType ObjectType, bool MemberOfUnknownSpecialization; if (!Actions.isTemplateName(getCurScope(), SS, /*hasTemplateKeyword=*/false, TemplateName, ObjectType, EnteringContext, - Template, MemberOfUnknownSpecialization, - /*Disambiguation=*/false, /*MayBeNNS=*/true)) + Template, MemberOfUnknownSpecialization)) return; FixDigraph(*this, PP, Next, SecondToken, tok::unknown, @@ -354,8 +353,7 @@ bool Parser::ParseOptionalCXXScopeSpecifier( TemplateTy Template; TemplateNameKind TNK = Actions.ActOnTemplateName( getCurScope(), SS, TemplateKWLoc, TemplateName, ObjectType, - EnteringContext, Template, /*AllowInjectedClassName*/ true, - /*MayBeNNS=*/true); + EnteringContext, Template, /*AllowInjectedClassName*/ true); if (AnnotateTemplateIdToken(Template, TNK, SS, TemplateKWLoc, TemplateName, false)) return true; @@ -407,6 +405,7 @@ bool Parser::ParseOptionalCXXScopeSpecifier( : TemplateId->TemplateNameLoc; SS.SetInvalid(SourceRange(StartLoc, CCLoc)); } + continue; } @@ -529,19 +528,18 @@ bool Parser::ParseOptionalCXXScopeSpecifier( UnqualifiedId TemplateName; TemplateName.setIdentifier(&II, Tok.getLocation()); bool MemberOfUnknownSpecialization; - if (TemplateNameKind TNK = Actions.isTemplateName( - getCurScope(), SS, - /*hasTemplateKeyword=*/false, TemplateName, ObjectType, - EnteringContext, Template, MemberOfUnknownSpecialization, - /*Disambiguation=*/false, - /*MayBeNNS=*/true)) { + if (TemplateNameKind TNK = Actions.isTemplateName(getCurScope(), SS, + /*hasTemplateKeyword=*/false, + TemplateName, + ObjectType, + EnteringContext, + Template, + MemberOfUnknownSpecialization)) { // If lookup didn't find anything, we treat the name as a template-name // anyway. C++20 requires this, and in prior language modes it improves // error recovery. But before we commit to this, check that we actually // have something that looks like a template-argument-list next. - if (!IsTypename && - (TNK == TNK_Undeclared_template || - (!HasScopeSpecifier && ObjectType)) && + if (!IsTypename && TNK == TNK_Undeclared_template && isTemplateArgumentList(1) == TPResult::False) break; @@ -568,7 +566,11 @@ bool Parser::ParseOptionalCXXScopeSpecifier( // member of an unknown specialization. However, this will only // parse correctly as a template, so suggest the keyword 'template' // before 'getAs' and treat this as a dependent template name. - Diag(Tok.getLocation(), diag::ext_missing_dependent_template_keyword) + unsigned DiagID = diag::err_missing_dependent_template_keyword; + if (getLangOpts().MicrosoftExt) + DiagID = diag::warn_missing_dependent_template_keyword; + + Diag(Tok.getLocation(), DiagID) << II.getName() << FixItHint::CreateInsertion(Tok.getLocation(), "template "); } @@ -1918,12 +1920,12 @@ Parser::ParseCXXPseudoDestructor(Expr *Base, SourceLocation OpLoc, // argument list. This affects examples such as // void f(auto *p) { p->~X(); } // ... but there's no ambiguity, and nowhere to write 'template' in such an - // example, so we accept it anyway - if (Tok.is(tok::less) && ParseUnqualifiedIdTemplateId( - SS, ObjectType, Base && Base->containsErrors(), - /*TemplateKWLoc=*/SourceLocation(), TildeLoc, - Name, NameLoc, false, SecondTypeName, - /*AssumeTemplateId=*/true)) + // example, so we accept it anyway. + if (Tok.is(tok::less) && + ParseUnqualifiedIdTemplateId( + SS, ObjectType, Base && Base->containsErrors(), SourceLocation(), + Name, NameLoc, false, SecondTypeName, + /*AssumeTemplateId=*/true)) return ExprError(); return Actions.ActOnPseudoDestructorExpr(getCurScope(), Base, OpLoc, OpKind, @@ -2530,9 +2532,8 @@ bool Parser::ParseCXXTypeSpecifierSeq(DeclSpec &DS, DeclaratorContext Context) { /// \returns true if a parse error occurred, false otherwise. bool Parser::ParseUnqualifiedIdTemplateId( CXXScopeSpec &SS, ParsedType ObjectType, bool ObjectHadErrors, - SourceLocation TemplateKWLoc, SourceLocation TildeLoc, IdentifierInfo *Name, - SourceLocation NameLoc, bool EnteringContext, UnqualifiedId &Id, - bool AssumeTemplateId) { + SourceLocation TemplateKWLoc, IdentifierInfo *Name, SourceLocation NameLoc, + bool EnteringContext, UnqualifiedId &Id, bool AssumeTemplateId) { assert(Tok.is(tok::less) && "Expected '<' to finish parsing a template-id"); TemplateTy Template; @@ -2546,14 +2547,13 @@ bool Parser::ParseUnqualifiedIdTemplateId( // this template-id is used to form a nested-name-specifier or not. TNK = Actions.ActOnTemplateName(getCurScope(), SS, TemplateKWLoc, Id, ObjectType, EnteringContext, Template, - /*AllowInjectedClassName=*/true, - TildeLoc.isValid()); + /*AllowInjectedClassName*/ true); } else { bool MemberOfUnknownSpecialization; - TNK = Actions.isTemplateName( - getCurScope(), SS, TemplateKWLoc.isValid(), Id, ObjectType, - EnteringContext, Template, MemberOfUnknownSpecialization, - /*Disambiguation=*/false, TildeLoc.isValid()); + TNK = Actions.isTemplateName(getCurScope(), SS, + TemplateKWLoc.isValid(), Id, + ObjectType, EnteringContext, Template, + MemberOfUnknownSpecialization); // If lookup found nothing but we're assuming that this is a template // name, double-check that makes sense syntactically before committing // to it. @@ -2580,13 +2580,13 @@ bool Parser::ParseUnqualifiedIdTemplateId( else Name += Id.Identifier->getName(); } - Diag(Id.StartLocation, diag::ext_missing_dependent_template_keyword) + Diag(Id.StartLocation, diag::err_missing_dependent_template_keyword) << Name << FixItHint::CreateInsertion(Id.StartLocation, "template "); } TNK = Actions.ActOnTemplateName( getCurScope(), SS, TemplateKWLoc, Id, ObjectType, EnteringContext, - Template, /*AllowInjectedClassName=*/true, TildeLoc.isValid()); + Template, /*AllowInjectedClassName*/ true); } else if (TNK == TNK_Non_template) { return false; } @@ -2611,16 +2611,14 @@ bool Parser::ParseUnqualifiedIdTemplateId( bool MemberOfUnknownSpecialization; TemplateName.setIdentifier(Name, NameLoc); if (ObjectType) { - TNK = Actions.ActOnTemplateName(getCurScope(), SS, TemplateKWLoc, - TemplateName, ObjectType, EnteringContext, - Template, /*AllowInjectedClassName=*/true, - /*MayBeNNS=*/true); + TNK = Actions.ActOnTemplateName( + getCurScope(), SS, TemplateKWLoc, TemplateName, ObjectType, + EnteringContext, Template, /*AllowInjectedClassName*/ true); } else { TNK = Actions.isTemplateName(getCurScope(), SS, TemplateKWLoc.isValid(), - TemplateName, ObjectType, EnteringContext, - Template, MemberOfUnknownSpecialization, - /*Disambiguation=*/false, - /*MayBeNNS=*/true); + TemplateName, ObjectType, + EnteringContext, Template, + MemberOfUnknownSpecialization); if (TNK == TNK_Non_template && !Id.DestructorName.get()) { Diag(NameLoc, diag::err_destructor_template_id) @@ -2682,7 +2680,7 @@ bool Parser::ParseUnqualifiedIdTemplateId( if (Id.getKind() == UnqualifiedIdKind::IK_ConstructorName) Id.setConstructorName(Type.get(), NameLoc, RAngleLoc); else - Id.setDestructorName(TildeLoc, Type.get(), RAngleLoc); + Id.setDestructorName(Id.StartLocation, Type.get(), RAngleLoc); return false; } @@ -3030,9 +3028,8 @@ bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, ParsedType ObjectType, if (Tok.is(tok::less)) return ParseUnqualifiedIdTemplateId( SS, ObjectType, ObjectHadErrors, - TemplateKWLoc ? *TemplateKWLoc : SourceLocation(), - /*TildeLoc=*/SourceLocation(), Id, IdLoc, EnteringContext, Result, - TemplateSpecified); + TemplateKWLoc ? *TemplateKWLoc : SourceLocation(), Id, IdLoc, + EnteringContext, Result, TemplateSpecified); if (TemplateSpecified) { TemplateNameKind TNK = @@ -3127,15 +3124,13 @@ bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, ParsedType ObjectType, Tok.is(tok::less)) return ParseUnqualifiedIdTemplateId( SS, ObjectType, ObjectHadErrors, - TemplateKWLoc ? *TemplateKWLoc : SourceLocation(), - /*TildeLoc=*/SourceLocation(), /*Name=*/nullptr, - /*NameLoc=*/SourceLocation(), EnteringContext, Result, - TemplateSpecified); + TemplateKWLoc ? *TemplateKWLoc : SourceLocation(), nullptr, + SourceLocation(), EnteringContext, Result, TemplateSpecified); else if (TemplateSpecified && Actions.ActOnTemplateName( getCurScope(), SS, *TemplateKWLoc, Result, ObjectType, EnteringContext, Template, - /*AllowInjectedClassName=*/true) == TNK_Non_template) + /*AllowInjectedClassName*/ true) == TNK_Non_template) return true; return false; @@ -3225,8 +3220,8 @@ bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, ParsedType ObjectType, Result.setDestructorName(TildeLoc, nullptr, ClassNameLoc); return ParseUnqualifiedIdTemplateId( SS, ObjectType, ObjectHadErrors, - TemplateKWLoc ? *TemplateKWLoc : SourceLocation(), TildeLoc, - ClassName, ClassNameLoc, EnteringContext, Result, TemplateSpecified); + TemplateKWLoc ? *TemplateKWLoc : SourceLocation(), ClassName, + ClassNameLoc, EnteringContext, Result, TemplateSpecified); } // Note that this is a destructor name. diff --git a/clang/lib/Parse/ParseTemplate.cpp b/clang/lib/Parse/ParseTemplate.cpp index 7e30afa2c64a4f..a5130f56600e54 100644 --- a/clang/lib/Parse/ParseTemplate.cpp +++ b/clang/lib/Parse/ParseTemplate.cpp @@ -1523,19 +1523,6 @@ ParsedTemplateArgument Parser::ParseTemplateArgument() { ExprArg.get(), Loc); } -void Parser::ExpandEmbedIntoTemplateArgList(TemplateArgList &TemplateArgs) { - EmbedAnnotationData *Data = - reinterpret_cast(Tok.getAnnotationValue()); - SourceLocation StartLoc = ConsumeAnnotationToken(); - ASTContext &Context = Actions.getASTContext(); - for (auto Byte : Data->BinaryData) { - Expr *E = IntegerLiteral::Create(Context, llvm::APInt(CHAR_BIT, Byte), - Context.UnsignedCharTy, StartLoc); - TemplateArgs.push_back( - ParsedTemplateArgument(ParsedTemplateArgument::NonType, E, StartLoc)); - } -} - /// ParseTemplateArgumentList - Parse a C++ template-argument-list /// (C++ [temp.names]). Returns true if there was an error. /// @@ -1560,24 +1547,20 @@ bool Parser::ParseTemplateArgumentList(TemplateArgList &TemplateArgs, do { PreferredType.enterFunctionArgument(Tok.getLocation(), RunSignatureHelp); - if (Tok.is(tok::annot_embed)) { - ExpandEmbedIntoTemplateArgList(TemplateArgs); - } else { - ParsedTemplateArgument Arg = ParseTemplateArgument(); - SourceLocation EllipsisLoc; - if (TryConsumeToken(tok::ellipsis, EllipsisLoc)) - Arg = Actions.ActOnPackExpansion(Arg, EllipsisLoc); - - if (Arg.isInvalid()) { - if (PP.isCodeCompletionReached() && !CalledSignatureHelp) - RunSignatureHelp(); - return true; - } - - // Save this template argument. - TemplateArgs.push_back(Arg); + ParsedTemplateArgument Arg = ParseTemplateArgument(); + SourceLocation EllipsisLoc; + if (TryConsumeToken(tok::ellipsis, EllipsisLoc)) + Arg = Actions.ActOnPackExpansion(Arg, EllipsisLoc); + + if (Arg.isInvalid()) { + if (PP.isCodeCompletionReached() && !CalledSignatureHelp) + RunSignatureHelp(); + return true; } + // Save this template argument. + TemplateArgs.push_back(Arg); + // If the next token is a comma, consume it and keep reading // arguments. } while (TryConsumeToken(tok::comma)); diff --git a/clang/lib/Sema/CMakeLists.txt b/clang/lib/Sema/CMakeLists.txt index 980a83d4431aa2..5934c8c30daf90 100644 --- a/clang/lib/Sema/CMakeLists.txt +++ b/clang/lib/Sema/CMakeLists.txt @@ -82,6 +82,7 @@ add_clang_library(clangSema SemaSystemZ.cpp SemaTemplate.cpp SemaTemplateDeduction.cpp + SemaTemplateDeductionGuide.cpp SemaTemplateInstantiate.cpp SemaTemplateInstantiateDecl.cpp SemaTemplateVariadic.cpp diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index 2f9ef28da2c3e1..995e4cbadacfe2 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -39,6 +39,11 @@ enum LifetimeKind { /// This is a mem-initializer: if it would extend a temporary (other than via /// a default member initializer), the program is ill-formed. LK_MemInitializer, + + /// The lifetime of a temporary bound to this entity probably ends too soon, + /// because the entity is a pointer and we assign the address of a temporary + /// object to it. + LK_Assignment, }; using LifetimeResult = llvm::PointerIntPair; @@ -971,6 +976,8 @@ static void checkExprLifetimeImpl(Sema &SemaRef, const InitializedEntity *ExtendingEntity, LifetimeKind LK, const AssignedEntity *AEntity, Expr *Init) { + assert((AEntity && LK == LK_Assignment) || + (InitEntity && LK != LK_Assignment)); // If this entity doesn't have an interesting lifetime, don't bother looking // for temporaries within its initializer. if (LK == LK_FullExpression) @@ -1008,19 +1015,7 @@ static void checkExprLifetimeImpl(Sema &SemaRef, return true; } } - if (AEntity) { - if (!MTE) - return false; - assert(shouldLifetimeExtendThroughPath(Path) == - PathLifetimeKind::NoExtend && - "No lifetime extension for assignments"); - if (!pathContainsInit(Path)) - SemaRef.Diag(DiagLoc, diag::warn_dangling_pointer_assignment) - << AEntity->LHS << DiagRange; - return false; - } - assert(InitEntity && "only for initialization"); switch (LK) { case LK_FullExpression: llvm_unreachable("already handled this"); @@ -1077,6 +1072,17 @@ static void checkExprLifetimeImpl(Sema &SemaRef, break; } + case LK_Assignment: { + if (!MTE) + return false; + assert(shouldLifetimeExtendThroughPath(Path) == + PathLifetimeKind::NoExtend && + "No lifetime extension for assignments"); + if (!pathContainsInit(Path)) + SemaRef.Diag(DiagLoc, diag::warn_dangling_pointer_assignment) + << AEntity->LHS << DiagRange; + return false; + } case LK_MemInitializer: { if (MTE) { // Under C++ DR1696, if a mem-initializer (or a default member @@ -1283,10 +1289,11 @@ void checkExprLifetime(Sema &SemaRef, const InitializedEntity &Entity, void checkExprLifetime(Sema &SemaRef, const AssignedEntity &Entity, Expr *Init) { - LifetimeKind LK = LK_FullExpression; - if (Entity.LHS->getType()->isPointerType()) // builtin pointer type - LK = LK_Extended; - checkExprLifetimeImpl(SemaRef, nullptr, nullptr, LK, &Entity, Init); + if (!Entity.LHS->getType()->isPointerType()) // builtin pointer type + return; + checkExprLifetimeImpl(SemaRef, /*InitEntity=*/nullptr, + /*ExtendingEntity=*/nullptr, LK_Assignment, &Entity, + Init); } } // namespace clang::sema diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp index 7fcf5754f9dd73..ca88d138aef5d3 100644 --- a/clang/lib/Sema/HLSLExternalSemaSource.cpp +++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp @@ -482,12 +482,6 @@ void HLSLExternalSemaSource::defineHLSLVectorAlias() { void HLSLExternalSemaSource::defineTrivialHLSLTypes() { defineHLSLVectorAlias(); - - ResourceDecl = BuiltinTypeDeclBuilder(*SemaPtr, HLSLNamespace, "Resource") - .startDefinition() - .addHandleMember(AccessSpecifier::AS_public) - .completeDefinition() - .Record; } /// Set up common members and attributes for buffer types diff --git a/clang/lib/Sema/SemaCXXScopeSpec.cpp b/clang/lib/Sema/SemaCXXScopeSpec.cpp index dd61bb22e3dfa7..5b2d65247e72e5 100644 --- a/clang/lib/Sema/SemaCXXScopeSpec.cpp +++ b/clang/lib/Sema/SemaCXXScopeSpec.cpp @@ -356,41 +356,29 @@ bool Sema::isAcceptableNestedNameSpecifier(const NamedDecl *SD, return false; } -/// If the given nested-name-specifier begins with a bare identifier -/// (e.g., Base::), perform name lookup for that identifier as a -/// nested-name-specifier within the given scope, and return the result of that -/// name lookup. -bool Sema::LookupFirstQualifierInScope(Scope *S, NestedNameSpecifier *NNS, - UnresolvedSetImpl &R) { - if (!S) - return false; +NamedDecl *Sema::FindFirstQualifierInScope(Scope *S, NestedNameSpecifier *NNS) { + if (!S || !NNS) + return nullptr; while (NNS->getPrefix()) NNS = NNS->getPrefix(); - // FIXME: This is a rather nasty hack! Ideally we should get the results - // from LookupTemplateName/BuildCXXNestedNameSpecifier. - const IdentifierInfo *II = NNS->getAsIdentifier(); - if (!II) { - if (const auto *DTST = - dyn_cast_if_present( - NNS->getAsType())) - II = DTST->getIdentifier(); - else - return false; - } - assert(II && "Missing first qualifier in scope"); - LookupResult Found(*this, II, SourceLocation(), - NNS->getAsIdentifier() ? LookupNestedNameSpecifierName - : LookupOrdinaryName); + if (NNS->getKind() != NestedNameSpecifier::Identifier) + return nullptr; + + LookupResult Found(*this, NNS->getAsIdentifier(), SourceLocation(), + LookupNestedNameSpecifierName); LookupName(Found, S); + assert(!Found.isAmbiguous() && "Cannot handle ambiguities here yet"); - if (Found.empty()) - return false; + if (!Found.isSingleResult()) + return nullptr; - R.addAllDecls(Found.asUnresolvedSet().pairs()); - Found.suppressDiagnostics(); - return true; + NamedDecl *Result = Found.getFoundDecl(); + if (isAcceptableNestedNameSpecifier(Result)) + return Result; + + return nullptr; } namespace { @@ -419,82 +407,112 @@ class NestedNameSpecifierValidatorCCC final bool Sema::BuildCXXNestedNameSpecifier(Scope *S, NestedNameSpecInfo &IdInfo, bool EnteringContext, CXXScopeSpec &SS, + NamedDecl *ScopeLookupResult, bool ErrorRecoveryLookup, bool *IsCorrectedToColon, bool OnlyNamespace) { if (IdInfo.Identifier->isEditorPlaceholder()) return true; - if (IsCorrectedToColon) - *IsCorrectedToColon = false; - - QualType ObjectType = GetTypeFromParser(IdInfo.ObjectType); LookupResult Found(*this, IdInfo.Identifier, IdInfo.IdentifierLoc, OnlyNamespace ? LookupNamespaceName : LookupNestedNameSpecifierName); + QualType ObjectType = GetTypeFromParser(IdInfo.ObjectType); - // C++ [basic.lookup.qual.general]p3: - // Qualified name lookup in a class, namespace, or enumeration performs a - // search of the scope associated with it except as specified below. - LookupParsedName(Found, S, &SS, ObjectType, - /*AllowBuiltinCreation=*/false, EnteringContext); - - // C++ [basic.lookup.qual.general]p3: - // [...] Unless otherwise specified, a qualified name undergoes qualified - // name lookup in its lookup context from the point where it appears unless - // the lookup context either is dependent and is not the current - // instantiation or is not a class or class template. - if (Found.wasNotFoundInCurrentInstantiation()) { - // Don't speculate if we're just trying to improve error recovery. - if (ErrorRecoveryLookup) - return true; - - // The lookup context is dependent and either: - // - it is not the current instantiation, or - // - it is the current instantiation, it has at least one dependent base - // class, and qualified lookup found nothing. - // Build a dependent nested-name-specifier. We will lookup the name again - // during instantiation. - SS.Extend(Context, IdInfo.Identifier, IdInfo.IdentifierLoc, IdInfo.CCLoc); - return false; + // Determine where to perform name lookup + DeclContext *LookupCtx = nullptr; + bool isDependent = false; + if (IsCorrectedToColon) + *IsCorrectedToColon = false; + if (!ObjectType.isNull()) { + // This nested-name-specifier occurs in a member access expression, e.g., + // x->B::f, and we are looking into the type of the object. + assert(!SS.isSet() && "ObjectType and scope specifier cannot coexist"); + LookupCtx = computeDeclContext(ObjectType); + isDependent = ObjectType->isDependentType(); + } else if (SS.isSet()) { + // This nested-name-specifier occurs after another nested-name-specifier, + // so look into the context associated with the prior nested-name-specifier. + LookupCtx = computeDeclContext(SS, EnteringContext); + isDependent = isDependentScopeSpecifier(SS); + Found.setContextRange(SS.getRange()); } bool ObjectTypeSearchedInScope = false; + if (LookupCtx) { + // Perform "qualified" name lookup into the declaration context we + // computed, which is either the type of the base of a member access + // expression or the declaration context associated with a prior + // nested-name-specifier. + + // The declaration context must be complete. + if (!LookupCtx->isDependentContext() && + RequireCompleteDeclContext(SS, LookupCtx)) + return true; - // C++ [basic.lookup.qual.general]p2: - // A member-qualified name is the (unique) component name, if any, of - // - an unqualified-id or - // - a nested-name-specifier of the form type-name :: or namespace-name :: - // in the id-expression of a class member access expression. - // - // C++ [basic.lookup.qual.general]p3: - // [...] If nothing is found by qualified lookup for a member-qualified - // name that is the terminal name of a nested-name-specifier and is not - // dependent, it undergoes unqualified lookup. - // - // In 'x.A::B::y', 'A' will undergo unqualified lookup if qualified lookup - // in the type of 'x' finds nothing. If the lookup context is dependent, - // we perform the unqualified lookup in the template definition context - // and store the results so we can replicate the lookup during instantiation. - if (Found.empty() && !ObjectType.isNull()) { - if (S) { - LookupName(Found, S); - } else if (!SS.getUnqualifiedLookups().empty()) { - Found.addAllDecls(SS.getUnqualifiedLookups()); - Found.resolveKind(); + LookupQualifiedName(Found, LookupCtx); + + if (!ObjectType.isNull() && Found.empty()) { + // C++ [basic.lookup.classref]p4: + // If the id-expression in a class member access is a qualified-id of + // the form + // + // class-name-or-namespace-name::... + // + // the class-name-or-namespace-name following the . or -> operator is + // looked up both in the context of the entire postfix-expression and in + // the scope of the class of the object expression. If the name is found + // only in the scope of the class of the object expression, the name + // shall refer to a class-name. If the name is found only in the + // context of the entire postfix-expression, the name shall refer to a + // class-name or namespace-name. [...] + // + // Qualified name lookup into a class will not find a namespace-name, + // so we do not need to diagnose that case specifically. However, + // this qualified name lookup may find nothing. In that case, perform + // unqualified name lookup in the given scope (if available) or + // reconstruct the result from when name lookup was performed at template + // definition time. + if (S) + LookupName(Found, S); + else if (ScopeLookupResult) + Found.addDecl(ScopeLookupResult); + + ObjectTypeSearchedInScope = true; } - ObjectTypeSearchedInScope = true; + } else if (!isDependent) { + // Perform unqualified name lookup in the current scope. + LookupName(Found, S); } if (Found.isAmbiguous()) return true; + // If we performed lookup into a dependent context and did not find anything, + // that's fine: just build a dependent nested-name-specifier. + if (Found.empty() && isDependent && + !(LookupCtx && LookupCtx->isRecord() && + (!cast(LookupCtx)->hasDefinition() || + !cast(LookupCtx)->hasAnyDependentBases()))) { + // Don't speculate if we're just trying to improve error recovery. + if (ErrorRecoveryLookup) + return true; + + // We were not able to compute the declaration context for a dependent + // base object type or prior nested-name-specifier, so this + // nested-name-specifier refers to an unknown specialization. Just build + // a dependent nested-name-specifier. + SS.Extend(Context, IdInfo.Identifier, IdInfo.IdentifierLoc, IdInfo.CCLoc); + return false; + } + if (Found.empty() && !ErrorRecoveryLookup) { // If identifier is not found as class-name-or-namespace-name, but is found // as other entity, don't look for typos. LookupResult R(*this, Found.getLookupNameInfo(), LookupOrdinaryName); - LookupParsedName(R, S, &SS, ObjectType, - /*AllowBuiltinCreation=*/false, EnteringContext); - + if (LookupCtx) + LookupQualifiedName(R, LookupCtx); + else if (S && !isDependent) + LookupName(R, S); if (!R.empty()) { // Don't diagnose problems with this speculative lookup. R.suppressDiagnostics(); @@ -521,11 +539,6 @@ bool Sema::BuildCXXNestedNameSpecifier(Scope *S, NestedNameSpecInfo &IdInfo, } } - DeclContext *LookupCtx = - SS.isSet() - ? computeDeclContext(SS, EnteringContext) - : (!ObjectType.isNull() ? computeDeclContext(ObjectType) : nullptr); - if (Found.empty() && !ErrorRecoveryLookup && !getLangOpts().MSVCCompat) { // We haven't found anything, and we're not recovering from a // different kind of error, so look for typos. @@ -581,14 +594,14 @@ bool Sema::BuildCXXNestedNameSpecifier(Scope *S, NestedNameSpecInfo &IdInfo, // scope, reconstruct the result from the template instantiation itself. // // Note that C++11 does *not* perform this redundant lookup. - NamedDecl *OuterDecl = nullptr; + NamedDecl *OuterDecl; if (S) { LookupResult FoundOuter(*this, IdInfo.Identifier, IdInfo.IdentifierLoc, LookupNestedNameSpecifierName); LookupName(FoundOuter, S); OuterDecl = FoundOuter.getAsSingle(); - } else if (!SS.getUnqualifiedLookups().empty()) - OuterDecl = SS.getUnqualifiedLookups().front().getDecl(); + } else + OuterDecl = ScopeLookupResult; if (isAcceptableNestedNameSpecifier(OuterDecl) && OuterDecl->getCanonicalDecl() != SD->getCanonicalDecl() && @@ -766,7 +779,7 @@ bool Sema::ActOnCXXNestedNameSpecifier(Scope *S, NestedNameSpecInfo &IdInfo, return true; return BuildCXXNestedNameSpecifier(S, IdInfo, EnteringContext, SS, - /*ErrorRecoveryLookup=*/false, + /*ScopeLookupResult=*/nullptr, false, IsCorrectedToColon, OnlyNamespace); } @@ -827,7 +840,7 @@ bool Sema::IsInvalidUnlessNestedName(Scope *S, CXXScopeSpec &SS, return false; return !BuildCXXNestedNameSpecifier(S, IdInfo, EnteringContext, SS, - /*ErrorRecoveryLookup=*/true); + /*ScopeLookupResult=*/nullptr, true); } bool Sema::ActOnCXXNestedNameSpecifier(Scope *S, diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp index 8fea7b0cf0d47c..88d4732c7d5c6a 100644 --- a/clang/lib/Sema/SemaCodeComplete.cpp +++ b/clang/lib/Sema/SemaCodeComplete.cpp @@ -6864,7 +6864,7 @@ void SemaCodeCompletion::CodeCompleteNamespaceDecl(Scope *S) { NS(Ctx->decls_begin()), NSEnd(Ctx->decls_end()); NS != NSEnd; ++NS) - OrigToLatest[NS->getOriginalNamespace()] = *NS; + OrigToLatest[NS->getFirstDecl()] = *NS; // Add the most recent definition (or extended definition) of each // namespace to the list of results. diff --git a/clang/lib/Sema/SemaCoroutine.cpp b/clang/lib/Sema/SemaCoroutine.cpp index fa0eb1d2afbeee..81334c817b2af2 100644 --- a/clang/lib/Sema/SemaCoroutine.cpp +++ b/clang/lib/Sema/SemaCoroutine.cpp @@ -306,8 +306,8 @@ static ExprResult buildMemberCall(Sema &S, Expr *Base, SourceLocation Loc, // FIXME: Fix BuildMemberReferenceExpr to take a const CXXScopeSpec&. CXXScopeSpec SS; ExprResult Result = S.BuildMemberReferenceExpr( - Base, Base->getType(), Loc, /*IsPtr=*/false, SS, SourceLocation(), - NameInfo, /*TemplateArgs=*/nullptr, + Base, Base->getType(), Loc, /*IsPtr=*/false, SS, + SourceLocation(), nullptr, NameInfo, /*TemplateArgs=*/nullptr, /*Scope=*/nullptr); if (Result.isInvalid()) return ExprError(); diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index a7da5285ad0e40..f24912cde275a9 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -1275,11 +1275,9 @@ static bool checkTupleLikeDecomposition(Sema &S, if (UseMemberGet) { // if [lookup of member get] finds at least one declaration, the // initializer is e.get(). - E = S.BuildMemberReferenceExpr(E.get(), DecompType, Loc, - /*IsArrow=*/false, - /*SS=*/CXXScopeSpec(), - /*TemplateKWLoc=*/SourceLocation(), - MemberGet, &Args, /*S=*/nullptr); + E = S.BuildMemberReferenceExpr(E.get(), DecompType, Loc, false, + CXXScopeSpec(), SourceLocation(), nullptr, + MemberGet, &Args, nullptr); if (E.isInvalid()) return true; @@ -4903,12 +4901,16 @@ BuildImplicitMemberInitializer(Sema &SemaRef, CXXConstructorDecl *Constructor, MemberLookup.addDecl(Indirect ? cast(Indirect) : cast(Field), AS_public); MemberLookup.resolveKind(); - ExprResult CtorArg = SemaRef.BuildMemberReferenceExpr( - MemberExprBase, ParamType, Loc, - /*IsArrow=*/false, SS, - /*TemplateKWLoc=*/SourceLocation(), MemberLookup, - /*TemplateArgs=*/nullptr, - /*S=*/nullptr); + ExprResult CtorArg + = SemaRef.BuildMemberReferenceExpr(MemberExprBase, + ParamType, Loc, + /*IsArrow=*/false, + SS, + /*TemplateKWLoc=*/SourceLocation(), + /*FirstQualifierInScope=*/nullptr, + MemberLookup, + /*TemplateArgs=*/nullptr, + /*S*/nullptr); if (CtorArg.isInvalid()) return true; @@ -9070,7 +9072,10 @@ ComputeDefaultedComparisonExceptionSpec(Sema &S, SourceLocation Loc, EnterExpressionEvaluationContext Context( S, Sema::ExpressionEvaluationContext::Unevaluated); - CXXRecordDecl *RD = cast(FD->getLexicalParent()); + CXXRecordDecl *RD = + cast(FD->getFriendObjectKind() == Decl::FOK_None + ? FD->getDeclContext() + : FD->getLexicalDeclContext()); SourceLocation BodyLoc = FD->getEndLoc().isValid() ? FD->getEndLoc() : FD->getLocation(); StmtResult Body = @@ -11253,6 +11258,34 @@ void Sema::CheckExplicitObjectMemberFunction(Declarator &D, D.setInvalidType(); } + // Friend declarations require some care. Consider: + // + // namespace N { + // struct A{}; + // int f(A); + // } + // + // struct S { + // struct T { + // int f(this T); + // }; + // + // friend int T::f(this T); // Allow this. + // friend int f(this S); // But disallow this. + // friend int N::f(this A); // And disallow this. + // }; + // + // Here, it seems to suffice to check whether the scope + // specifier designates a class type. + if (D.getDeclSpec().isFriendSpecified() && + !isa_and_present( + computeDeclContext(D.getCXXScopeSpec()))) { + Diag(ExplicitObjectParam->getBeginLoc(), + diag::err_explicit_object_parameter_nonmember) + << D.getSourceRange() << /*non-member=*/2 << IsLambda; + D.setInvalidType(); + } + if (IsLambda && FTI.hasMutableQualifier()) { Diag(ExplicitObjectParam->getBeginLoc(), diag::err_explicit_object_parameter_mutable) @@ -11263,10 +11296,8 @@ void Sema::CheckExplicitObjectMemberFunction(Declarator &D, return; if (!DC || !DC->isRecord()) { - Diag(ExplicitObjectParam->getLocation(), - diag::err_explicit_object_parameter_nonmember) - << D.getSourceRange() << /*non-member=*/2 << IsLambda; - D.setInvalidType(); + assert(D.isInvalidType() && "Explicit object parameter in non-member " + "should have been diagnosed already"); return; } @@ -14334,10 +14365,8 @@ class MemberBuilder: public ExprBuilder { public: Expr *build(Sema &S, SourceLocation Loc) const override { return assertNotNull(S.BuildMemberReferenceExpr( - Builder.build(S, Loc), Type, Loc, IsArrow, SS, - /*TemplateKwLoc=*/SourceLocation(), MemberLookup, - /*TemplateArgs=*/nullptr, /*S=*/nullptr) - .get()); + Builder.build(S, Loc), Type, Loc, IsArrow, SS, SourceLocation(), + nullptr, MemberLookup, nullptr, nullptr).get()); } MemberBuilder(const ExprBuilder &Builder, QualType Type, bool IsArrow, @@ -14543,11 +14572,13 @@ buildSingleCopyAssignRecursively(Sema &S, SourceLocation Loc, QualType T, Loc); // Create the reference to operator=. - ExprResult OpEqualRef = S.BuildMemberReferenceExpr( - To.build(S, Loc), T, Loc, /*IsArrow=*/false, SS, - /*TemplateKWLoc=*/SourceLocation(), OpLookup, - /*TemplateArgs=*/nullptr, /*S*/ nullptr, - /*SuppressQualifierCheck=*/true); + ExprResult OpEqualRef + = S.BuildMemberReferenceExpr(To.build(S, Loc), T, Loc, /*IsArrow=*/false, + SS, /*TemplateKWLoc=*/SourceLocation(), + /*FirstQualifierInScope=*/nullptr, + OpLookup, + /*TemplateArgs=*/nullptr, /*S*/nullptr, + /*SuppressQualifierCheck=*/true); if (OpEqualRef.isInvalid()) return StmtError(); @@ -17153,9 +17184,8 @@ bool Sema::EvaluateStaticAssertMessageAsString(Expr *Message, auto BuildExpr = [&](LookupResult &LR) { ExprResult Res = BuildMemberReferenceExpr( - Message, Message->getType(), Message->getBeginLoc(), /*IsArrow=*/false, - /*SS=*/CXXScopeSpec(), /*TemplateKWLoc=*/SourceLocation(), LR, - /*TemplateArgs=*/nullptr, /*S=*/nullptr); + Message, Message->getType(), Message->getBeginLoc(), false, + CXXScopeSpec(), SourceLocation(), nullptr, LR, nullptr, nullptr); if (Res.isInvalid()) return ExprError(); Res = BuildCallExpr(nullptr, Res.get(), Loc, std::nullopt, Loc, nullptr, diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 9940f8cb629554..0698c3fbe98d29 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -2624,7 +2624,7 @@ recoverFromMSUnqualifiedLookup(Sema &S, ASTContext &Context, return CXXDependentScopeMemberExpr::Create( Context, /*This=*/nullptr, ThisType, /*IsArrow=*/true, /*Op=*/SourceLocation(), NestedNameSpecifierLoc(), TemplateKWLoc, - /*UnqualifiedLookups=*/std::nullopt, NameInfo, TemplateArgs); + /*FirstQualifierFoundInScope=*/nullptr, NameInfo, TemplateArgs); } // Synthesize a fake NNS that points to the derived class. This will @@ -3643,9 +3643,9 @@ bool Sema::CheckLoopHintExpr(Expr *E, SourceLocation Loc, bool AllowZero) { ExprResult Sema::ActOnNumericConstant(const Token &Tok, Scope *UDLScope) { // Fast path for a single digit (which is quite common). A single digit // cannot have a trigraph, escaped newline, radix prefix, or suffix. - if (Tok.getLength() == 1) { + if (Tok.getLength() == 1 || Tok.getKind() == tok::binary_data) { const char Val = PP.getSpellingOfSingleCharacterNumericConstant(Tok); - return ActOnIntegerConstant(Tok.getLocation(), Val-'0'); + return ActOnIntegerConstant(Tok.getLocation(), Val); } SmallString<128> SpellingBuffer; @@ -17958,17 +17958,16 @@ void Sema::MarkFunctionReferenced(SourceLocation Loc, FunctionDecl *Func, if (FirstInstantiation || TSK != TSK_ImplicitInstantiation || Func->isConstexpr()) { - if (Func->isConstexpr()) + if (isa(Func->getDeclContext()) && + cast(Func->getDeclContext())->isLocalClass() && + CodeSynthesisContexts.size()) + PendingLocalImplicitInstantiations.push_back( + std::make_pair(Func, PointOfInstantiation)); + else if (Func->isConstexpr()) // Do not defer instantiations of constexpr functions, to avoid the // expression evaluator needing to call back into Sema if it sees a // call to such a function. InstantiateFunctionDefinition(PointOfInstantiation, Func); - else if (isa(Func->getDeclContext()) && - cast(Func->getDeclContext()) - ->isLocalClass() && - CodeSynthesisContexts.size()) - PendingLocalImplicitInstantiations.push_back( - std::make_pair(Func, PointOfInstantiation)); else { Func->setInstantiationIsPending(true); PendingInstantiations.push_back( diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp index 8519618bacfee6..2070f3b7bb3a2f 100644 --- a/clang/lib/Sema/SemaExprMember.cpp +++ b/clang/lib/Sema/SemaExprMember.cpp @@ -552,9 +552,11 @@ static Decl *FindGetterSetterNameDecl(const ObjCObjectPointerType *QIdTy, } ExprResult -Sema::ActOnDependentMemberExpr(Expr *BaseExpr, QualType BaseType, bool IsArrow, - SourceLocation OpLoc, const CXXScopeSpec &SS, +Sema::ActOnDependentMemberExpr(Expr *BaseExpr, QualType BaseType, + bool IsArrow, SourceLocation OpLoc, + const CXXScopeSpec &SS, SourceLocation TemplateKWLoc, + NamedDecl *FirstQualifierInScope, const DeclarationNameInfo &NameInfo, const TemplateArgumentListInfo *TemplateArgs) { // Even in dependent contexts, try to diagnose base expressions with @@ -588,8 +590,8 @@ Sema::ActOnDependentMemberExpr(Expr *BaseExpr, QualType BaseType, bool IsArrow, // must have pointer type, and the accessed type is the pointee. return CXXDependentScopeMemberExpr::Create( Context, BaseExpr, BaseType, IsArrow, OpLoc, - SS.getWithLocInContext(Context), TemplateKWLoc, - SS.getUnqualifiedLookups(), NameInfo, TemplateArgs); + SS.getWithLocInContext(Context), TemplateKWLoc, FirstQualifierInScope, + NameInfo, TemplateArgs); } /// We know that the given qualified member reference points only to @@ -765,9 +767,8 @@ static bool LookupMemberExprInRecord(Sema &SemaRef, LookupResult &R, R.addDecl(ND); R.resolveKind(); return SemaRef.BuildMemberReferenceExpr( - BaseExpr, BaseExpr->getType(), OpLoc, IsArrow, SS, - /*TemplateKWLoc=*/SourceLocation(), R, /*TemplateArgs=*/nullptr, - /*S=*/nullptr); + BaseExpr, BaseExpr->getType(), OpLoc, IsArrow, SS, SourceLocation(), + nullptr, R, nullptr, nullptr); }, Sema::CTK_ErrorRecovery, DC); @@ -783,7 +784,7 @@ static ExprResult LookupMemberExpr(Sema &S, LookupResult &R, ExprResult Sema::BuildMemberReferenceExpr( Expr *Base, QualType BaseType, SourceLocation OpLoc, bool IsArrow, CXXScopeSpec &SS, SourceLocation TemplateKWLoc, - const DeclarationNameInfo &NameInfo, + NamedDecl *FirstQualifierInScope, const DeclarationNameInfo &NameInfo, const TemplateArgumentListInfo *TemplateArgs, const Scope *S, ActOnMemberAccessExtraArgs *ExtraArgs) { LookupResult R(*this, NameInfo, LookupMemberName); @@ -827,9 +828,10 @@ ExprResult Sema::BuildMemberReferenceExpr( if (SS.isInvalid()) return ExprError(); - return BuildMemberReferenceExpr(Base, BaseType, OpLoc, IsArrow, SS, - TemplateKWLoc, R, TemplateArgs, S, - /*SuppressQualifierCheck=*/false, ExtraArgs); + return BuildMemberReferenceExpr(Base, BaseType, + OpLoc, IsArrow, SS, TemplateKWLoc, + FirstQualifierInScope, R, TemplateArgs, S, + false, ExtraArgs); } ExprResult @@ -967,11 +969,17 @@ static bool IsInFnTryBlockHandler(const Scope *S) { return false; } -ExprResult Sema::BuildMemberReferenceExpr( - Expr *BaseExpr, QualType BaseExprType, SourceLocation OpLoc, bool IsArrow, - const CXXScopeSpec &SS, SourceLocation TemplateKWLoc, LookupResult &R, - const TemplateArgumentListInfo *TemplateArgs, const Scope *S, - bool SuppressQualifierCheck, ActOnMemberAccessExtraArgs *ExtraArgs) { +ExprResult +Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType, + SourceLocation OpLoc, bool IsArrow, + const CXXScopeSpec &SS, + SourceLocation TemplateKWLoc, + NamedDecl *FirstQualifierInScope, + LookupResult &R, + const TemplateArgumentListInfo *TemplateArgs, + const Scope *S, + bool SuppressQualifierCheck, + ActOnMemberAccessExtraArgs *ExtraArgs) { assert(!SS.isInvalid() && "nested-name-specifier cannot be invalid"); // If the member wasn't found in the current instantiation, or if the // arrow operator was used with a dependent non-pointer object expression, @@ -981,8 +989,8 @@ ExprResult Sema::BuildMemberReferenceExpr( (SS.isSet() ? SS.getScopeRep()->isDependent() : BaseExprType->isDependentType()))) return ActOnDependentMemberExpr(BaseExpr, BaseExprType, IsArrow, OpLoc, SS, - TemplateKWLoc, R.getLookupNameInfo(), - TemplateArgs); + TemplateKWLoc, FirstQualifierInScope, + R.getLookupNameInfo(), TemplateArgs); QualType BaseType = BaseExprType; if (IsArrow) { @@ -1187,9 +1195,9 @@ ExprResult Sema::BuildMemberReferenceExpr( // Non-dependent member, but dependent template arguments. if (!VDecl.get()) - return ActOnDependentMemberExpr(BaseExpr, BaseExpr->getType(), IsArrow, - OpLoc, SS, TemplateKWLoc, MemberNameInfo, - TemplateArgs); + return ActOnDependentMemberExpr( + BaseExpr, BaseExpr->getType(), IsArrow, OpLoc, SS, TemplateKWLoc, + FirstQualifierInScope, MemberNameInfo, TemplateArgs); VarDecl *Var = cast(VDecl.get()); if (!Var->getTemplateSpecializationKind()) @@ -1755,16 +1763,15 @@ ExprResult Sema::ActOnMemberAccessExpr(Scope *S, Expr *Base, const TemplateArgumentListInfo *TemplateArgs; DecomposeUnqualifiedId(Id, TemplateArgsBuffer, NameInfo, TemplateArgs); - bool IsArrow = OpKind == tok::arrow; + + bool IsArrow = (OpKind == tok::arrow); if (getLangOpts().HLSL && IsArrow) return ExprError(Diag(OpLoc, diag::err_hlsl_operator_unsupported) << 2); - UnresolvedSet<4> UnqualifiedLookups; - if (SS.isValid() && - LookupFirstQualifierInScope(S, SS.getScopeRep(), UnqualifiedLookups)) { - SS.setUnqualifiedLookups(UnqualifiedLookups.pairs()); - } + NamedDecl *FirstQualifierInScope + = (!SS.isSet() ? nullptr : FindFirstQualifierInScope(S, SS.getScopeRep())); + // This is a postfix expression, so get rid of ParenListExprs. ExprResult Result = MaybeConvertParenListExprToParenExpr(S, Base); if (Result.isInvalid()) return ExprError(); @@ -1772,8 +1779,8 @@ ExprResult Sema::ActOnMemberAccessExpr(Scope *S, Expr *Base, ActOnMemberAccessExtraArgs ExtraArgs = {S, Id, ObjCImpDecl}; ExprResult Res = BuildMemberReferenceExpr( - Base, Base->getType(), OpLoc, IsArrow, SS, TemplateKWLoc, NameInfo, - TemplateArgs, S, &ExtraArgs); + Base, Base->getType(), OpLoc, IsArrow, SS, TemplateKWLoc, + FirstQualifierInScope, NameInfo, TemplateArgs, S, &ExtraArgs); if (!Res.isInvalid() && isa(Res.get())) CheckMemberAccessOfNoDeref(cast(Res.get())); @@ -1917,8 +1924,9 @@ Sema::BuildImplicitMemberExpr(const CXXScopeSpec &SS, baseExpr = BuildCXXThisExpr(loc, ThisTy, /*IsImplicit=*/true); } - return BuildMemberReferenceExpr(baseExpr, ThisTy, - /*OpLoc=*/SourceLocation(), - /*IsArrow=*/!getLangOpts().HLSL, SS, - TemplateKWLoc, R, TemplateArgs, S); + return BuildMemberReferenceExpr( + baseExpr, ThisTy, + /*OpLoc=*/SourceLocation(), + /*IsArrow=*/!getLangOpts().HLSL, SS, TemplateKWLoc, + /*FirstQualifierInScope=*/nullptr, R, TemplateArgs, S); } diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index 7851c5d080cf38..7a6a64529f52ec 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -2325,7 +2325,7 @@ static bool LookupQualifiedNameInUsingDirectives(Sema &S, LookupResult &R, // We have already looked into the initial namespace; seed the queue // with its using-children. for (auto *I : StartDC->using_directives()) { - NamespaceDecl *ND = I->getNominatedNamespace()->getOriginalNamespace(); + NamespaceDecl *ND = I->getNominatedNamespace()->getFirstDecl(); if (S.isVisible(I) && Visited.insert(ND).second) Queue.push_back(ND); } diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index bc03280aa8aaf9..bc6894018065f2 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -4797,6 +4797,12 @@ static bool checkNestingOfRegions(Sema &SemaRef, const DSAStackTy *Stack, ShouldBeInTeamsRegion, ShouldBeInLoopSimdRegion, } Recommend = NoRecommend; + + SmallVector LeafOrComposite; + ArrayRef ParentLOC = + getLeafOrCompositeConstructs(ParentRegion, LeafOrComposite); + OpenMPDirectiveKind EnclosingConstruct = ParentLOC.back(); + if (SemaRef.LangOpts.OpenMP >= 51 && Stack->isParentOrderConcurrent() && CurrentRegion != OMPD_simd && CurrentRegion != OMPD_loop && CurrentRegion != OMPD_parallel && @@ -4828,7 +4834,7 @@ static bool checkNestingOfRegions(Sema &SemaRef, const DSAStackTy *Stack, << (SemaRef.LangOpts.OpenMP >= 50 ? 1 : 0); return CurrentRegion != OMPD_simd; } - if (ParentRegion == OMPD_atomic) { + if (EnclosingConstruct == OMPD_atomic) { // OpenMP [2.16, Nesting of Regions] // OpenMP constructs may not be nested inside an atomic region. SemaRef.Diag(StartLoc, diag::err_omp_prohibited_region_atomic); @@ -4839,8 +4845,7 @@ static bool checkNestingOfRegions(Sema &SemaRef, const DSAStackTy *Stack, // Orphaned section directives are prohibited. That is, the section // directives must appear within the sections construct and must not be // encountered elsewhere in the sections region. - if (ParentRegion != OMPD_sections && - ParentRegion != OMPD_parallel_sections) { + if (EnclosingConstruct != OMPD_sections) { SemaRef.Diag(StartLoc, diag::err_omp_orphaned_section_directive) << (ParentRegion != OMPD_unknown) << getOpenMPDirectiveName(ParentRegion); @@ -4861,7 +4866,7 @@ static bool checkNestingOfRegions(Sema &SemaRef, const DSAStackTy *Stack, if (SemaRef.LangOpts.OpenMP >= 50 && CurrentRegion == OMPD_loop && (BindKind == OMPC_BIND_parallel || BindKind == OMPC_BIND_teams) && (isOpenMPWorksharingDirective(ParentRegion) || - ParentRegion == OMPD_loop)) { + EnclosingConstruct == OMPD_loop)) { int ErrorMsgNumber = (BindKind == OMPC_BIND_parallel) ? 1 : 4; SemaRef.Diag(StartLoc, diag::err_omp_prohibited_region) << true << getOpenMPDirectiveName(ParentRegion) << ErrorMsgNumber @@ -4881,27 +4886,17 @@ static bool checkNestingOfRegions(Sema &SemaRef, const DSAStackTy *Stack, // construct-type-clause is not taskgroup must be closely nested inside an // OpenMP construct that matches the type specified in // construct-type-clause. - NestingProhibited = - !((CancelRegion == OMPD_parallel && - (ParentRegion == OMPD_parallel || - ParentRegion == OMPD_target_parallel)) || - (CancelRegion == OMPD_for && - (ParentRegion == OMPD_for || ParentRegion == OMPD_parallel_for || - ParentRegion == OMPD_target_parallel_for || - ParentRegion == OMPD_distribute_parallel_for || - ParentRegion == OMPD_teams_distribute_parallel_for || - ParentRegion == OMPD_target_teams_distribute_parallel_for)) || - (CancelRegion == OMPD_taskgroup && - (ParentRegion == OMPD_task || - (SemaRef.getLangOpts().OpenMP >= 50 && - (ParentRegion == OMPD_taskloop || - ParentRegion == OMPD_master_taskloop || - ParentRegion == OMPD_masked_taskloop || - ParentRegion == OMPD_parallel_masked_taskloop || - ParentRegion == OMPD_parallel_master_taskloop)))) || - (CancelRegion == OMPD_sections && - (ParentRegion == OMPD_section || ParentRegion == OMPD_sections || - ParentRegion == OMPD_parallel_sections))); + ArrayRef Leafs = getLeafConstructsOrSelf(ParentRegion); + if (CancelRegion == OMPD_taskgroup) { + NestingProhibited = EnclosingConstruct != OMPD_task && + (SemaRef.getLangOpts().OpenMP < 50 || + EnclosingConstruct != OMPD_taskloop); + } else if (CancelRegion == OMPD_sections) { + NestingProhibited = EnclosingConstruct != OMPD_section && + EnclosingConstruct != OMPD_sections; + } else { + NestingProhibited = CancelRegion != Leafs.back(); + } OrphanSeen = ParentRegion == OMPD_unknown; } else if (CurrentRegion == OMPD_master || CurrentRegion == OMPD_masked) { // OpenMP 5.1 [2.22, Nesting of Regions] @@ -4942,13 +4937,12 @@ static bool checkNestingOfRegions(Sema &SemaRef, const DSAStackTy *Stack, // OpenMP 5.1 [2.22, Nesting of Regions] // A barrier region may not be closely nested inside a worksharing, loop, // task, taskloop, critical, ordered, atomic, or masked region. - NestingProhibited = - isOpenMPWorksharingDirective(ParentRegion) || - isOpenMPGenericLoopDirective(ParentRegion) || - isOpenMPTaskingDirective(ParentRegion) || ParentRegion == OMPD_master || - ParentRegion == OMPD_masked || ParentRegion == OMPD_parallel_master || - ParentRegion == OMPD_parallel_masked || ParentRegion == OMPD_critical || - ParentRegion == OMPD_ordered; + NestingProhibited = isOpenMPWorksharingDirective(ParentRegion) || + isOpenMPGenericLoopDirective(ParentRegion) || + isOpenMPTaskingDirective(ParentRegion) || + llvm::is_contained({OMPD_masked, OMPD_master, + OMPD_critical, OMPD_ordered}, + EnclosingConstruct); } else if (isOpenMPWorksharingDirective(CurrentRegion) && !isOpenMPParallelDirective(CurrentRegion) && !isOpenMPTeamsDirective(CurrentRegion)) { @@ -4956,13 +4950,12 @@ static bool checkNestingOfRegions(Sema &SemaRef, const DSAStackTy *Stack, // A loop region that binds to a parallel region or a worksharing region // may not be closely nested inside a worksharing, loop, task, taskloop, // critical, ordered, atomic, or masked region. - NestingProhibited = - isOpenMPWorksharingDirective(ParentRegion) || - isOpenMPGenericLoopDirective(ParentRegion) || - isOpenMPTaskingDirective(ParentRegion) || ParentRegion == OMPD_master || - ParentRegion == OMPD_masked || ParentRegion == OMPD_parallel_master || - ParentRegion == OMPD_parallel_masked || ParentRegion == OMPD_critical || - ParentRegion == OMPD_ordered; + NestingProhibited = isOpenMPWorksharingDirective(ParentRegion) || + isOpenMPGenericLoopDirective(ParentRegion) || + isOpenMPTaskingDirective(ParentRegion) || + llvm::is_contained({OMPD_masked, OMPD_master, + OMPD_critical, OMPD_ordered}, + EnclosingConstruct); Recommend = ShouldBeInParallelRegion; } else if (CurrentRegion == OMPD_ordered) { // OpenMP [2.16, Nesting of Regions] @@ -4973,7 +4966,7 @@ static bool checkNestingOfRegions(Sema &SemaRef, const DSAStackTy *Stack, // OpenMP [2.8.1,simd Construct, Restrictions] // An ordered construct with the simd clause is the only OpenMP construct // that can appear in the simd region. - NestingProhibited = ParentRegion == OMPD_critical || + NestingProhibited = EnclosingConstruct == OMPD_critical || isOpenMPTaskingDirective(ParentRegion) || !(isOpenMPSimdDirective(ParentRegion) || Stack->isParentOrderedRegion()); @@ -4983,22 +4976,19 @@ static bool checkNestingOfRegions(Sema &SemaRef, const DSAStackTy *Stack, // If specified, a teams construct must be contained within a target // construct. NestingProhibited = - (SemaRef.LangOpts.OpenMP <= 45 && ParentRegion != OMPD_target) || - (SemaRef.LangOpts.OpenMP >= 50 && ParentRegion != OMPD_unknown && - ParentRegion != OMPD_target); + (SemaRef.LangOpts.OpenMP <= 45 && EnclosingConstruct != OMPD_target) || + (SemaRef.LangOpts.OpenMP >= 50 && EnclosingConstruct != OMPD_unknown && + EnclosingConstruct != OMPD_target); OrphanSeen = ParentRegion == OMPD_unknown; Recommend = ShouldBeInTargetRegion; } else if (CurrentRegion == OMPD_scan) { if (SemaRef.LangOpts.OpenMP >= 50) { - SmallVector LeafOrComposite; - std::ignore = getLeafOrCompositeConstructs(ParentRegion, LeafOrComposite); // OpenMP spec 5.0 and 5.1 require scan to be directly enclosed by for, // simd, or for simd. This has to take into account combined directives. // In 5.2 this seems to be implied by the fact that the specified // separated constructs are do, for, and simd. - OpenMPDirectiveKind Enclosing = LeafOrComposite.back(); - NestingProhibited = Enclosing != OMPD_for && Enclosing != OMPD_simd && - Enclosing != OMPD_for_simd; + NestingProhibited = !llvm::is_contained( + {OMPD_for, OMPD_simd, OMPD_for_simd}, EnclosingConstruct); } else { NestingProhibited = true; } @@ -5007,7 +4997,7 @@ static bool checkNestingOfRegions(Sema &SemaRef, const DSAStackTy *Stack, } if (!NestingProhibited && !isOpenMPTargetExecutionDirective(CurrentRegion) && !isOpenMPTargetDataManagementDirective(CurrentRegion) && - (ParentRegion == OMPD_teams || ParentRegion == OMPD_target_teams)) { + EnclosingConstruct == OMPD_teams) { // OpenMP [5.1, 2.22, Nesting of Regions] // distribute, distribute simd, distribute parallel worksharing-loop, // distribute parallel worksharing-loop SIMD, loop, parallel regions, @@ -5029,17 +5019,15 @@ static bool checkNestingOfRegions(Sema &SemaRef, const DSAStackTy *Stack, // If the bind clause is present on the loop construct and binding is // teams then the corresponding loop region must be strictly nested inside // a teams region. - NestingProhibited = BindKind == OMPC_BIND_teams && - ParentRegion != OMPD_teams && - ParentRegion != OMPD_target_teams; + NestingProhibited = + BindKind == OMPC_BIND_teams && EnclosingConstruct != OMPD_teams; Recommend = ShouldBeInTeamsRegion; } if (!NestingProhibited && isOpenMPNestingDistributeDirective(CurrentRegion)) { // OpenMP 4.5 [2.17 Nesting of Regions] // The region associated with the distribute construct must be strictly // nested inside a teams region - NestingProhibited = - (ParentRegion != OMPD_teams && ParentRegion != OMPD_target_teams); + NestingProhibited = EnclosingConstruct != OMPD_teams; Recommend = ShouldBeInTeamsRegion; } if (!NestingProhibited && @@ -9102,14 +9090,15 @@ void SemaOpenMP::ActOnOpenMPLoopInitialization(SourceLocation ForLoc, isOpenMPSimdDirective(DKind) ? (DSAStack->hasMutipleLoops() ? OMPC_lastprivate : OMPC_linear) : OMPC_private; + auto IsOpenMPTaskloopDirective = [](OpenMPDirectiveKind DK) { + return getLeafConstructsOrSelf(DK).back() == OMPD_taskloop; + }; if (((isOpenMPSimdDirective(DKind) && DVar.CKind != OMPC_unknown && DVar.CKind != PredeterminedCKind && DVar.RefExpr && (getLangOpts().OpenMP <= 45 || (DVar.CKind != OMPC_lastprivate && DVar.CKind != OMPC_private))) || - ((isOpenMPWorksharingDirective(DKind) || DKind == OMPD_taskloop || - DKind == OMPD_master_taskloop || DKind == OMPD_masked_taskloop || - DKind == OMPD_parallel_master_taskloop || - DKind == OMPD_parallel_masked_taskloop || + ((isOpenMPWorksharingDirective(DKind) || + IsOpenMPTaskloopDirective(DKind) || isOpenMPDistributeDirective(DKind)) && !isOpenMPSimdDirective(DKind) && DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_private && DVar.CKind != OMPC_lastprivate)) && diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index d4a48858ec4196..074062ebbb594f 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -16036,11 +16036,13 @@ Sema::BuildForRangeBeginEndCall(SourceLocation Loc, CandidateSet->clear(OverloadCandidateSet::CSK_Normal); if (!MemberLookup.empty()) { - ExprResult MemberRef = BuildMemberReferenceExpr( - Range, Range->getType(), Loc, - /*IsPtr=*/false, /*SS=*/CXXScopeSpec(), - /*TemplateKWLoc=*/SourceLocation(), MemberLookup, - /*TemplateArgs=*/nullptr, S); + ExprResult MemberRef = + BuildMemberReferenceExpr(Range, Range->getType(), Loc, + /*IsPtr=*/false, CXXScopeSpec(), + /*TemplateKWLoc=*/SourceLocation(), + /*FirstQualifierInScope=*/nullptr, + MemberLookup, + /*TemplateArgs=*/nullptr, S); if (MemberRef.isInvalid()) { *CallExpr = ExprError(); return FRS_DiagnosticIssued; diff --git a/clang/lib/Sema/SemaStmtAsm.cpp b/clang/lib/Sema/SemaStmtAsm.cpp index da2e99b6bc00c7..32d42f3c3f3bb7 100644 --- a/clang/lib/Sema/SemaStmtAsm.cpp +++ b/clang/lib/Sema/SemaStmtAsm.cpp @@ -900,8 +900,7 @@ Sema::LookupInlineAsmVarDeclField(Expr *E, StringRef Member, return CXXDependentScopeMemberExpr::Create( Context, E, T, /*IsArrow=*/false, AsmLoc, NestedNameSpecifierLoc(), SourceLocation(), - /*UnqualifiedLookups=*/std::nullopt, NameInfo, - /*TemplateArgs=*/nullptr); + /*FirstQualifierFoundInScope=*/nullptr, NameInfo, /*TemplateArgs=*/nullptr); } const RecordType *RT = T->getAs(); @@ -924,9 +923,8 @@ Sema::LookupInlineAsmVarDeclField(Expr *E, StringRef Member, // Make an Expr to thread through OpDecl. ExprResult Result = BuildMemberReferenceExpr( - E, E->getType(), AsmLoc, /*IsArrow=*/false, /*SS=*/CXXScopeSpec(), - /*TemplateKWLoc*/ SourceLocation(), FieldResult, - /*TemplateArgs=*/nullptr, /*S=*/nullptr); + E, E->getType(), AsmLoc, /*IsArrow=*/false, CXXScopeSpec(), + SourceLocation(), nullptr, FieldResult, nullptr, nullptr); return Result; } diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 9296cc66d69187..9d96201625389f 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -174,12 +174,15 @@ bool Sema::hasAnyAcceptableTemplateNames(LookupResult &R, return false; } -TemplateNameKind -Sema::isTemplateName(Scope *S, CXXScopeSpec &SS, bool hasTemplateKeyword, - const UnqualifiedId &Name, ParsedType ObjectTypePtr, - bool EnteringContext, TemplateTy &TemplateResult, - bool &MemberOfUnknownSpecialization, bool Disambiguation, - bool MayBeNNS) { +TemplateNameKind Sema::isTemplateName(Scope *S, + CXXScopeSpec &SS, + bool hasTemplateKeyword, + const UnqualifiedId &Name, + ParsedType ObjectTypePtr, + bool EnteringContext, + TemplateTy &TemplateResult, + bool &MemberOfUnknownSpecialization, + bool Disambiguation) { assert(getLangOpts().CPlusPlus && "No template names in C!"); DeclarationName TName; @@ -210,9 +213,8 @@ Sema::isTemplateName(Scope *S, CXXScopeSpec &SS, bool hasTemplateKeyword, if (LookupTemplateName(R, S, SS, ObjectType, EnteringContext, /*RequiredTemplate=*/SourceLocation(), &AssumedTemplate, - /*AllowTypoCorrection=*/!Disambiguation, MayBeNNS)) + /*AllowTypoCorrection=*/!Disambiguation)) return TNK_Non_template; - MemberOfUnknownSpecialization = R.wasNotFoundInCurrentInstantiation(); if (AssumedTemplate != AssumedTemplateKind::None) { @@ -378,7 +380,7 @@ bool Sema::LookupTemplateName(LookupResult &Found, Scope *S, CXXScopeSpec &SS, QualType ObjectType, bool EnteringContext, RequiredTemplateKind RequiredTemplate, AssumedTemplateKind *ATK, - bool AllowTypoCorrection, bool MayBeNNS) { + bool AllowTypoCorrection) { if (ATK) *ATK = AssumedTemplateKind::None; @@ -387,89 +389,92 @@ bool Sema::LookupTemplateName(LookupResult &Found, Scope *S, CXXScopeSpec &SS, Found.setTemplateNameLookup(true); - // Template names cannot appear inside an Objective-C class or object type - // or a vector type. - // - // FIXME: This is wrong. For example: - // - // template using Vec = T __attribute__((ext_vector_type(4))); - // Vec vi; - // vi.Vec::~Vec(); - // - // ... should be accepted but we will not treat 'Vec' as a template name - // here. The right thing to do would be to check if the name is a valid - // vector component name, and look up a template name if not. And similarly - // for lookups into Objective-C class and object types, where the same - // problem can arise. - if (!ObjectType.isNull() && (ObjectType->isVectorType() || - ObjectType->isObjCObjectOrInterfaceType())) { - Found.clear(); - return false; - } - - LookupParsedName(Found, S, &SS, ObjectType, - /*AllowBuiltinCreation=*/false, EnteringContext); + // Determine where to perform name lookup + DeclContext *LookupCtx = nullptr; + bool IsDependent = false; + if (!ObjectType.isNull()) { + // This nested-name-specifier occurs in a member access expression, e.g., + // x->B::f, and we are looking into the type of the object. + assert(SS.isEmpty() && "ObjectType and scope specifier cannot coexist"); + LookupCtx = computeDeclContext(ObjectType); + IsDependent = !LookupCtx && ObjectType->isDependentType(); + assert((IsDependent || !ObjectType->isIncompleteType() || + !ObjectType->getAs() || + ObjectType->castAs()->isBeingDefined()) && + "Caller should have completed object type"); + + // Template names cannot appear inside an Objective-C class or object type + // or a vector type. + // + // FIXME: This is wrong. For example: + // + // template using Vec = T __attribute__((ext_vector_type(4))); + // Vec vi; + // vi.Vec::~Vec(); + // + // ... should be accepted but we will not treat 'Vec' as a template name + // here. The right thing to do would be to check if the name is a valid + // vector component name, and look up a template name if not. And similarly + // for lookups into Objective-C class and object types, where the same + // problem can arise. + if (ObjectType->isObjCObjectOrInterfaceType() || + ObjectType->isVectorType()) { + Found.clear(); + return false; + } + } else if (SS.isNotEmpty()) { + // This nested-name-specifier occurs after another nested-name-specifier, + // so long into the context associated with the prior nested-name-specifier. + LookupCtx = computeDeclContext(SS, EnteringContext); + IsDependent = !LookupCtx && isDependentScopeSpecifier(SS); - // C++ [basic.lookup.qual.general]p3: - // [...] Unless otherwise specified, a qualified name undergoes qualified - // name lookup in its lookup context from the point where it appears unless - // the lookup context either is dependent and is not the current - // instantiation or is not a class or class template. - // - // The lookup context is dependent and either: - // - it is not the current instantiation, or - // - it is the current instantiation, it has at least one dependent base - // class, and qualified lookup found nothing. - // - // If this is a member-qualified name that is the terminal name of a - // nested-name-specifier, we perform unqualified lookup and store the results - // so we can replicate the lookup during instantiation. The results of the - // unqualified loookup are *not* used to determine whether '<' is interpreted - // as the delimiter of a template-argument-list. - // - // For example: - // - // template - // struct A { - // int x; - // }; - // - // template - // using B = A; - // - // template - // void f(A a, A b) { - // a.B::x; // error: missing 'template' before 'B' - // b.B::x; // ok, lookup context is not dependent - // } - if (Found.wasNotFoundInCurrentInstantiation()) - return false; + // The declaration context must be complete. + if (LookupCtx && RequireCompleteDeclContext(SS, LookupCtx)) + return true; + } bool ObjectTypeSearchedInScope = false; - - // C++ [basic.lookup.qual.general]p2: - // A member-qualified name is the (unique) component name, if any, of - // - an unqualified-id or - // - a nested-name-specifier of the form type-name :: or namespace-name :: - // in the id-expression of a class member access expression. - // - // C++ [basic.lookup.qual.general]p3: - // [...] If nothing is found by qualified lookup for a member-qualified - // name that is the terminal name of a nested-name-specifier and is not - // dependent, it undergoes unqualified lookup. - // - // In 'x.A::B::y', 'A' will undergo unqualified lookup if qualified lookup - // in the type of 'x' finds nothing. If the lookup context is dependent, - // we perform the unqualified lookup in the template definition context - // and store the results so we can replicate the lookup during instantiation. - if (MayBeNNS && Found.empty() && !ObjectType.isNull()) { - if (S) { + bool AllowFunctionTemplatesInLookup = true; + if (LookupCtx) { + // Perform "qualified" name lookup into the declaration context we + // computed, which is either the type of the base of a member access + // expression or the declaration context associated with a prior + // nested-name-specifier. + LookupQualifiedName(Found, LookupCtx); + + // FIXME: The C++ standard does not clearly specify what happens in the + // case where the object type is dependent, and implementations vary. In + // Clang, we treat a name after a . or -> as a template-name if lookup + // finds a non-dependent member or member of the current instantiation that + // is a type template, or finds no such members and lookup in the context + // of the postfix-expression finds a type template. In the latter case, the + // name is nonetheless dependent, and we may resolve it to a member of an + // unknown specialization when we come to instantiate the template. + IsDependent |= Found.wasNotFoundInCurrentInstantiation(); + } + + if (SS.isEmpty() && (ObjectType.isNull() || Found.empty())) { + // C++ [basic.lookup.classref]p1: + // In a class member access expression (5.2.5), if the . or -> token is + // immediately followed by an identifier followed by a <, the + // identifier must be looked up to determine whether the < is the + // beginning of a template argument list (14.2) or a less-than operator. + // The identifier is first looked up in the class of the object + // expression. If the identifier is not found, it is then looked up in + // the context of the entire postfix-expression and shall name a class + // template. + if (S) LookupName(Found, S); - } else if (!SS.getUnqualifiedLookups().empty()) { - Found.addAllDecls(SS.getUnqualifiedLookups()); - Found.resolveKind(); + + if (!ObjectType.isNull()) { + // FIXME: We should filter out all non-type templates here, particularly + // variable templates and concepts. But the exclusion of alias templates + // and template template parameters is a wording defect. + AllowFunctionTemplatesInLookup = false; + ObjectTypeSearchedInScope = true; } - ObjectTypeSearchedInScope = true; + + IsDependent |= Found.wasNotFoundInCurrentInstantiation(); } if (Found.isAmbiguous()) @@ -489,7 +494,7 @@ bool Sema::LookupTemplateName(LookupResult &Found, Scope *S, CXXScopeSpec &SS, getLangOpts().CPlusPlus20 && llvm::all_of(Found, [](NamedDecl *ND) { return isa(ND->getUnderlyingDecl()); }); - if (AllFunctions || Found.empty()) { + if (AllFunctions || (Found.empty() && !IsDependent)) { // If lookup found any functions, or if this is a name that can only be // used for a function, then strongly assume this is a function // template-id. @@ -501,15 +506,11 @@ bool Sema::LookupTemplateName(LookupResult &Found, Scope *S, CXXScopeSpec &SS, } } - if (Found.empty() && AllowTypoCorrection) { + if (Found.empty() && !IsDependent && AllowTypoCorrection) { // If we did not find any names, and this is not a disambiguation, attempt // to correct any typos. DeclarationName Name = Found.getLookupName(); Found.clear(); - DeclContext *LookupCtx = - SS.isSet() - ? computeDeclContext(SS, EnteringContext) - : (!ObjectType.isNull() ? computeDeclContext(ObjectType) : nullptr); // Simple filter callback that, for keywords, only accepts the C++ *_cast DefaultFilterCCC FilterCCC{}; FilterCCC.WantTypeSpecifiers = false; @@ -542,8 +543,13 @@ bool Sema::LookupTemplateName(LookupResult &Found, Scope *S, CXXScopeSpec &SS, NamedDecl *ExampleLookupResult = Found.empty() ? nullptr : Found.getRepresentativeDecl(); - FilterAcceptableTemplateNames(Found); + FilterAcceptableTemplateNames(Found, AllowFunctionTemplatesInLookup); if (Found.empty()) { + if (IsDependent) { + Found.setNotFoundInCurrentInstantiation(); + return false; + } + // If a 'template' keyword was used, a lookup that finds only non-template // names is an error. if (ExampleLookupResult && RequiredTemplate) { @@ -735,7 +741,7 @@ Sema::ActOnDependentIdExpression(const CXXScopeSpec &SS, /*IsArrow=*/!Context.getLangOpts().HLSL, /*OperatorLoc=*/SourceLocation(), /*QualifierLoc=*/NestedNameSpecifierLoc(), TemplateKWLoc, - /*UnqualifiedLookups=*/std::nullopt, NameInfo, TemplateArgs); + /*FirstQualifierFoundInScope=*/nullptr, NameInfo, TemplateArgs); } return BuildDependentDeclRefExpr(SS, TemplateKWLoc, NameInfo, TemplateArgs); } @@ -1758,7 +1764,7 @@ static void SetNestedNameSpecifier(Sema &S, TagDecl *T, // Returns the template parameter list with all default template argument // information. -static TemplateParameterList *GetTemplateParameterList(TemplateDecl *TD) { +TemplateParameterList *Sema::GetTemplateParameterList(TemplateDecl *TD) { // Make sure we get the template parameter list from the most // recent declaration, since that is the only one that is guaranteed to // have all the default template argument information. @@ -2167,1385 +2173,6 @@ DeclResult Sema::CheckClassTemplate( return NewTemplate; } -namespace { -/// Tree transform to "extract" a transformed type from a class template's -/// constructor to a deduction guide. -class ExtractTypeForDeductionGuide - : public TreeTransform { - llvm::SmallVectorImpl &MaterializedTypedefs; - ClassTemplateDecl *NestedPattern; - const MultiLevelTemplateArgumentList *OuterInstantiationArgs; - std::optional TypedefNameInstantiator; - -public: - typedef TreeTransform Base; - ExtractTypeForDeductionGuide( - Sema &SemaRef, - llvm::SmallVectorImpl &MaterializedTypedefs, - ClassTemplateDecl *NestedPattern, - const MultiLevelTemplateArgumentList *OuterInstantiationArgs) - : Base(SemaRef), MaterializedTypedefs(MaterializedTypedefs), - NestedPattern(NestedPattern), - OuterInstantiationArgs(OuterInstantiationArgs) { - if (OuterInstantiationArgs) - TypedefNameInstantiator.emplace( - SemaRef, SemaRef.getASTContext().getTranslationUnitDecl(), - *OuterInstantiationArgs); - } - - TypeSourceInfo *transform(TypeSourceInfo *TSI) { return TransformType(TSI); } - - /// Returns true if it's safe to substitute \p Typedef with - /// \p OuterInstantiationArgs. - bool mightReferToOuterTemplateParameters(TypedefNameDecl *Typedef) { - if (!NestedPattern) - return false; - - static auto WalkUp = [](DeclContext *DC, DeclContext *TargetDC) { - if (DC->Equals(TargetDC)) - return true; - while (DC->isRecord()) { - if (DC->Equals(TargetDC)) - return true; - DC = DC->getParent(); - } - return false; - }; - - if (WalkUp(Typedef->getDeclContext(), NestedPattern->getTemplatedDecl())) - return true; - if (WalkUp(NestedPattern->getTemplatedDecl(), Typedef->getDeclContext())) - return true; - return false; - } - - QualType - RebuildTemplateSpecializationType(TemplateName Template, - SourceLocation TemplateNameLoc, - TemplateArgumentListInfo &TemplateArgs) { - if (!OuterInstantiationArgs || - !isa_and_present(Template.getAsTemplateDecl())) - return Base::RebuildTemplateSpecializationType(Template, TemplateNameLoc, - TemplateArgs); - - auto *TATD = cast(Template.getAsTemplateDecl()); - auto *Pattern = TATD; - while (Pattern->getInstantiatedFromMemberTemplate()) - Pattern = Pattern->getInstantiatedFromMemberTemplate(); - if (!mightReferToOuterTemplateParameters(Pattern->getTemplatedDecl())) - return Base::RebuildTemplateSpecializationType(Template, TemplateNameLoc, - TemplateArgs); - - Decl *NewD = - TypedefNameInstantiator->InstantiateTypeAliasTemplateDecl(TATD); - if (!NewD) - return QualType(); - - auto *NewTATD = cast(NewD); - MaterializedTypedefs.push_back(NewTATD->getTemplatedDecl()); - - return Base::RebuildTemplateSpecializationType( - TemplateName(NewTATD), TemplateNameLoc, TemplateArgs); - } - - QualType TransformTypedefType(TypeLocBuilder &TLB, TypedefTypeLoc TL) { - ASTContext &Context = SemaRef.getASTContext(); - TypedefNameDecl *OrigDecl = TL.getTypedefNameDecl(); - TypedefNameDecl *Decl = OrigDecl; - // Transform the underlying type of the typedef and clone the Decl only if - // the typedef has a dependent context. - bool InDependentContext = OrigDecl->getDeclContext()->isDependentContext(); - - // A typedef/alias Decl within the NestedPattern may reference the outer - // template parameters. They're substituted with corresponding instantiation - // arguments here and in RebuildTemplateSpecializationType() above. - // Otherwise, we would have a CTAD guide with "dangling" template - // parameters. - // For example, - // template struct Outer { - // using Alias = S; - // template struct Inner { - // Inner(Alias); - // }; - // }; - if (OuterInstantiationArgs && InDependentContext && - TL.getTypePtr()->isInstantiationDependentType()) { - Decl = cast_if_present( - TypedefNameInstantiator->InstantiateTypedefNameDecl( - OrigDecl, /*IsTypeAlias=*/isa(OrigDecl))); - if (!Decl) - return QualType(); - MaterializedTypedefs.push_back(Decl); - } else if (InDependentContext) { - TypeLocBuilder InnerTLB; - QualType Transformed = - TransformType(InnerTLB, OrigDecl->getTypeSourceInfo()->getTypeLoc()); - TypeSourceInfo *TSI = InnerTLB.getTypeSourceInfo(Context, Transformed); - if (isa(OrigDecl)) - Decl = TypeAliasDecl::Create( - Context, Context.getTranslationUnitDecl(), OrigDecl->getBeginLoc(), - OrigDecl->getLocation(), OrigDecl->getIdentifier(), TSI); - else { - assert(isa(OrigDecl) && "Not a Type alias or typedef"); - Decl = TypedefDecl::Create( - Context, Context.getTranslationUnitDecl(), OrigDecl->getBeginLoc(), - OrigDecl->getLocation(), OrigDecl->getIdentifier(), TSI); - } - MaterializedTypedefs.push_back(Decl); - } - - QualType TDTy = Context.getTypedefType(Decl); - TypedefTypeLoc TypedefTL = TLB.push(TDTy); - TypedefTL.setNameLoc(TL.getNameLoc()); - - return TDTy; - } -}; - -// Build a deduction guide using the provided information. -// -// A deduction guide can be either a template or a non-template function -// declaration. If \p TemplateParams is null, a non-template function -// declaration will be created. -NamedDecl *buildDeductionGuide( - Sema &SemaRef, TemplateDecl *OriginalTemplate, - TemplateParameterList *TemplateParams, CXXConstructorDecl *Ctor, - ExplicitSpecifier ES, TypeSourceInfo *TInfo, SourceLocation LocStart, - SourceLocation Loc, SourceLocation LocEnd, bool IsImplicit, - llvm::ArrayRef MaterializedTypedefs = {}) { - DeclContext *DC = OriginalTemplate->getDeclContext(); - auto DeductionGuideName = - SemaRef.Context.DeclarationNames.getCXXDeductionGuideName( - OriginalTemplate); - - DeclarationNameInfo Name(DeductionGuideName, Loc); - ArrayRef Params = - TInfo->getTypeLoc().castAs().getParams(); - - // Build the implicit deduction guide template. - auto *Guide = - CXXDeductionGuideDecl::Create(SemaRef.Context, DC, LocStart, ES, Name, - TInfo->getType(), TInfo, LocEnd, Ctor); - Guide->setImplicit(IsImplicit); - Guide->setParams(Params); - - for (auto *Param : Params) - Param->setDeclContext(Guide); - for (auto *TD : MaterializedTypedefs) - TD->setDeclContext(Guide); - if (isa(DC)) - Guide->setAccess(AS_public); - - if (!TemplateParams) { - DC->addDecl(Guide); - return Guide; - } - - auto *GuideTemplate = FunctionTemplateDecl::Create( - SemaRef.Context, DC, Loc, DeductionGuideName, TemplateParams, Guide); - GuideTemplate->setImplicit(IsImplicit); - Guide->setDescribedFunctionTemplate(GuideTemplate); - - if (isa(DC)) - GuideTemplate->setAccess(AS_public); - - DC->addDecl(GuideTemplate); - return GuideTemplate; -} - -// Transform a given template type parameter `TTP`. -TemplateTypeParmDecl * -transformTemplateTypeParam(Sema &SemaRef, DeclContext *DC, - TemplateTypeParmDecl *TTP, - MultiLevelTemplateArgumentList &Args, - unsigned NewDepth, unsigned NewIndex) { - // TemplateTypeParmDecl's index cannot be changed after creation, so - // substitute it directly. - auto *NewTTP = TemplateTypeParmDecl::Create( - SemaRef.Context, DC, TTP->getBeginLoc(), TTP->getLocation(), NewDepth, - NewIndex, TTP->getIdentifier(), TTP->wasDeclaredWithTypename(), - TTP->isParameterPack(), TTP->hasTypeConstraint(), - TTP->isExpandedParameterPack() - ? std::optional(TTP->getNumExpansionParameters()) - : std::nullopt); - if (const auto *TC = TTP->getTypeConstraint()) - SemaRef.SubstTypeConstraint(NewTTP, TC, Args, - /*EvaluateConstraint=*/true); - if (TTP->hasDefaultArgument()) { - TemplateArgumentLoc InstantiatedDefaultArg; - if (!SemaRef.SubstTemplateArgument( - TTP->getDefaultArgument(), Args, InstantiatedDefaultArg, - TTP->getDefaultArgumentLoc(), TTP->getDeclName())) - NewTTP->setDefaultArgument(SemaRef.Context, InstantiatedDefaultArg); - } - SemaRef.CurrentInstantiationScope->InstantiatedLocal(TTP, NewTTP); - return NewTTP; -} -// Similar to above, but for non-type template or template template parameters. -template -NonTypeTemplateOrTemplateTemplateParmDecl * -transformTemplateParam(Sema &SemaRef, DeclContext *DC, - NonTypeTemplateOrTemplateTemplateParmDecl *OldParam, - MultiLevelTemplateArgumentList &Args, unsigned NewIndex, - unsigned NewDepth) { - // Ask the template instantiator to do the heavy lifting for us, then adjust - // the index of the parameter once it's done. - auto *NewParam = cast( - SemaRef.SubstDecl(OldParam, DC, Args)); - NewParam->setPosition(NewIndex); - NewParam->setDepth(NewDepth); - return NewParam; -} - -/// Transform to convert portions of a constructor declaration into the -/// corresponding deduction guide, per C++1z [over.match.class.deduct]p1. -struct ConvertConstructorToDeductionGuideTransform { - ConvertConstructorToDeductionGuideTransform(Sema &S, - ClassTemplateDecl *Template) - : SemaRef(S), Template(Template) { - // If the template is nested, then we need to use the original - // pattern to iterate over the constructors. - ClassTemplateDecl *Pattern = Template; - while (Pattern->getInstantiatedFromMemberTemplate()) { - if (Pattern->isMemberSpecialization()) - break; - Pattern = Pattern->getInstantiatedFromMemberTemplate(); - NestedPattern = Pattern; - } - - if (NestedPattern) - OuterInstantiationArgs = SemaRef.getTemplateInstantiationArgs(Template); - } - - Sema &SemaRef; - ClassTemplateDecl *Template; - ClassTemplateDecl *NestedPattern = nullptr; - - DeclContext *DC = Template->getDeclContext(); - CXXRecordDecl *Primary = Template->getTemplatedDecl(); - DeclarationName DeductionGuideName = - SemaRef.Context.DeclarationNames.getCXXDeductionGuideName(Template); - - QualType DeducedType = SemaRef.Context.getTypeDeclType(Primary); - - // Index adjustment to apply to convert depth-1 template parameters into - // depth-0 template parameters. - unsigned Depth1IndexAdjustment = Template->getTemplateParameters()->size(); - - // Instantiation arguments for the outermost depth-1 templates - // when the template is nested - MultiLevelTemplateArgumentList OuterInstantiationArgs; - - /// Transform a constructor declaration into a deduction guide. - NamedDecl *transformConstructor(FunctionTemplateDecl *FTD, - CXXConstructorDecl *CD) { - SmallVector SubstArgs; - - LocalInstantiationScope Scope(SemaRef); - - // C++ [over.match.class.deduct]p1: - // -- For each constructor of the class template designated by the - // template-name, a function template with the following properties: - - // -- The template parameters are the template parameters of the class - // template followed by the template parameters (including default - // template arguments) of the constructor, if any. - TemplateParameterList *TemplateParams = GetTemplateParameterList(Template); - if (FTD) { - TemplateParameterList *InnerParams = FTD->getTemplateParameters(); - SmallVector AllParams; - SmallVector Depth1Args; - AllParams.reserve(TemplateParams->size() + InnerParams->size()); - AllParams.insert(AllParams.begin(), - TemplateParams->begin(), TemplateParams->end()); - SubstArgs.reserve(InnerParams->size()); - Depth1Args.reserve(InnerParams->size()); - - // Later template parameters could refer to earlier ones, so build up - // a list of substituted template arguments as we go. - for (NamedDecl *Param : *InnerParams) { - MultiLevelTemplateArgumentList Args; - Args.setKind(TemplateSubstitutionKind::Rewrite); - Args.addOuterTemplateArguments(Depth1Args); - Args.addOuterRetainedLevel(); - if (NestedPattern) - Args.addOuterRetainedLevels(NestedPattern->getTemplateDepth()); - NamedDecl *NewParam = transformTemplateParameter(Param, Args); - if (!NewParam) - return nullptr; - // Constraints require that we substitute depth-1 arguments - // to match depths when substituted for evaluation later - Depth1Args.push_back(SemaRef.Context.getCanonicalTemplateArgument( - SemaRef.Context.getInjectedTemplateArg(NewParam))); - - if (NestedPattern) { - TemplateDeclInstantiator Instantiator(SemaRef, DC, - OuterInstantiationArgs); - Instantiator.setEvaluateConstraints(false); - SemaRef.runWithSufficientStackSpace(NewParam->getLocation(), [&] { - NewParam = cast(Instantiator.Visit(NewParam)); - }); - } - - assert(NewParam->getTemplateDepth() == 0 && - "Unexpected template parameter depth"); - - AllParams.push_back(NewParam); - SubstArgs.push_back(SemaRef.Context.getCanonicalTemplateArgument( - SemaRef.Context.getInjectedTemplateArg(NewParam))); - } - - // Substitute new template parameters into requires-clause if present. - Expr *RequiresClause = nullptr; - if (Expr *InnerRC = InnerParams->getRequiresClause()) { - MultiLevelTemplateArgumentList Args; - Args.setKind(TemplateSubstitutionKind::Rewrite); - Args.addOuterTemplateArguments(Depth1Args); - Args.addOuterRetainedLevel(); - if (NestedPattern) - Args.addOuterRetainedLevels(NestedPattern->getTemplateDepth()); - ExprResult E = SemaRef.SubstExpr(InnerRC, Args); - if (E.isInvalid()) - return nullptr; - RequiresClause = E.getAs(); - } - - TemplateParams = TemplateParameterList::Create( - SemaRef.Context, InnerParams->getTemplateLoc(), - InnerParams->getLAngleLoc(), AllParams, InnerParams->getRAngleLoc(), - RequiresClause); - } - - // If we built a new template-parameter-list, track that we need to - // substitute references to the old parameters into references to the - // new ones. - MultiLevelTemplateArgumentList Args; - Args.setKind(TemplateSubstitutionKind::Rewrite); - if (FTD) { - Args.addOuterTemplateArguments(SubstArgs); - Args.addOuterRetainedLevel(); - } - - FunctionProtoTypeLoc FPTL = CD->getTypeSourceInfo()->getTypeLoc() - .getAsAdjusted(); - assert(FPTL && "no prototype for constructor declaration"); - - // Transform the type of the function, adjusting the return type and - // replacing references to the old parameters with references to the - // new ones. - TypeLocBuilder TLB; - SmallVector Params; - SmallVector MaterializedTypedefs; - QualType NewType = transformFunctionProtoType(TLB, FPTL, Params, Args, - MaterializedTypedefs); - if (NewType.isNull()) - return nullptr; - TypeSourceInfo *NewTInfo = TLB.getTypeSourceInfo(SemaRef.Context, NewType); - - return buildDeductionGuide( - SemaRef, Template, TemplateParams, CD, CD->getExplicitSpecifier(), - NewTInfo, CD->getBeginLoc(), CD->getLocation(), CD->getEndLoc(), - /*IsImplicit=*/true, MaterializedTypedefs); - } - - /// Build a deduction guide with the specified parameter types. - NamedDecl *buildSimpleDeductionGuide(MutableArrayRef ParamTypes) { - SourceLocation Loc = Template->getLocation(); - - // Build the requested type. - FunctionProtoType::ExtProtoInfo EPI; - EPI.HasTrailingReturn = true; - QualType Result = SemaRef.BuildFunctionType(DeducedType, ParamTypes, Loc, - DeductionGuideName, EPI); - TypeSourceInfo *TSI = SemaRef.Context.getTrivialTypeSourceInfo(Result, Loc); - if (NestedPattern) - TSI = SemaRef.SubstType(TSI, OuterInstantiationArgs, Loc, - DeductionGuideName); - - if (!TSI) - return nullptr; - - FunctionProtoTypeLoc FPTL = - TSI->getTypeLoc().castAs(); - - // Build the parameters, needed during deduction / substitution. - SmallVector Params; - for (auto T : ParamTypes) { - auto *TSI = SemaRef.Context.getTrivialTypeSourceInfo(T, Loc); - if (NestedPattern) - TSI = SemaRef.SubstType(TSI, OuterInstantiationArgs, Loc, - DeclarationName()); - if (!TSI) - return nullptr; - - ParmVarDecl *NewParam = - ParmVarDecl::Create(SemaRef.Context, DC, Loc, Loc, nullptr, - TSI->getType(), TSI, SC_None, nullptr); - NewParam->setScopeInfo(0, Params.size()); - FPTL.setParam(Params.size(), NewParam); - Params.push_back(NewParam); - } - - return buildDeductionGuide( - SemaRef, Template, GetTemplateParameterList(Template), nullptr, - ExplicitSpecifier(), TSI, Loc, Loc, Loc, /*IsImplicit=*/true); - } - -private: - /// Transform a constructor template parameter into a deduction guide template - /// parameter, rebuilding any internal references to earlier parameters and - /// renumbering as we go. - NamedDecl *transformTemplateParameter(NamedDecl *TemplateParam, - MultiLevelTemplateArgumentList &Args) { - if (auto *TTP = dyn_cast(TemplateParam)) - return transformTemplateTypeParam( - SemaRef, DC, TTP, Args, TTP->getDepth() - 1, - Depth1IndexAdjustment + TTP->getIndex()); - if (auto *TTP = dyn_cast(TemplateParam)) - return transformTemplateParam(SemaRef, DC, TTP, Args, - Depth1IndexAdjustment + TTP->getIndex(), - TTP->getDepth() - 1); - auto *NTTP = cast(TemplateParam); - return transformTemplateParam(SemaRef, DC, NTTP, Args, - Depth1IndexAdjustment + NTTP->getIndex(), - NTTP->getDepth() - 1); - } - - QualType transformFunctionProtoType( - TypeLocBuilder &TLB, FunctionProtoTypeLoc TL, - SmallVectorImpl &Params, - MultiLevelTemplateArgumentList &Args, - SmallVectorImpl &MaterializedTypedefs) { - SmallVector ParamTypes; - const FunctionProtoType *T = TL.getTypePtr(); - - // -- The types of the function parameters are those of the constructor. - for (auto *OldParam : TL.getParams()) { - ParmVarDecl *NewParam = OldParam; - // Given - // template struct C { - // template struct D { - // template D(U, V); - // }; - // }; - // First, transform all the references to template parameters that are - // defined outside of the surrounding class template. That is T in the - // above example. - if (NestedPattern) { - NewParam = transformFunctionTypeParam( - NewParam, OuterInstantiationArgs, MaterializedTypedefs, - /*TransformingOuterPatterns=*/true); - if (!NewParam) - return QualType(); - } - // Then, transform all the references to template parameters that are - // defined at the class template and the constructor. In this example, - // they're U and V, respectively. - NewParam = - transformFunctionTypeParam(NewParam, Args, MaterializedTypedefs, - /*TransformingOuterPatterns=*/false); - if (!NewParam) - return QualType(); - ParamTypes.push_back(NewParam->getType()); - Params.push_back(NewParam); - } - - // -- The return type is the class template specialization designated by - // the template-name and template arguments corresponding to the - // template parameters obtained from the class template. - // - // We use the injected-class-name type of the primary template instead. - // This has the convenient property that it is different from any type that - // the user can write in a deduction-guide (because they cannot enter the - // context of the template), so implicit deduction guides can never collide - // with explicit ones. - QualType ReturnType = DeducedType; - TLB.pushTypeSpec(ReturnType).setNameLoc(Primary->getLocation()); - - // Resolving a wording defect, we also inherit the variadicness of the - // constructor. - FunctionProtoType::ExtProtoInfo EPI; - EPI.Variadic = T->isVariadic(); - EPI.HasTrailingReturn = true; - - QualType Result = SemaRef.BuildFunctionType( - ReturnType, ParamTypes, TL.getBeginLoc(), DeductionGuideName, EPI); - if (Result.isNull()) - return QualType(); - - FunctionProtoTypeLoc NewTL = TLB.push(Result); - NewTL.setLocalRangeBegin(TL.getLocalRangeBegin()); - NewTL.setLParenLoc(TL.getLParenLoc()); - NewTL.setRParenLoc(TL.getRParenLoc()); - NewTL.setExceptionSpecRange(SourceRange()); - NewTL.setLocalRangeEnd(TL.getLocalRangeEnd()); - for (unsigned I = 0, E = NewTL.getNumParams(); I != E; ++I) - NewTL.setParam(I, Params[I]); - - return Result; - } - - ParmVarDecl *transformFunctionTypeParam( - ParmVarDecl *OldParam, MultiLevelTemplateArgumentList &Args, - llvm::SmallVectorImpl &MaterializedTypedefs, - bool TransformingOuterPatterns) { - TypeSourceInfo *OldDI = OldParam->getTypeSourceInfo(); - TypeSourceInfo *NewDI; - if (auto PackTL = OldDI->getTypeLoc().getAs()) { - // Expand out the one and only element in each inner pack. - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(SemaRef, 0); - NewDI = - SemaRef.SubstType(PackTL.getPatternLoc(), Args, - OldParam->getLocation(), OldParam->getDeclName()); - if (!NewDI) return nullptr; - NewDI = - SemaRef.CheckPackExpansion(NewDI, PackTL.getEllipsisLoc(), - PackTL.getTypePtr()->getNumExpansions()); - } else - NewDI = SemaRef.SubstType(OldDI, Args, OldParam->getLocation(), - OldParam->getDeclName()); - if (!NewDI) - return nullptr; - - // Extract the type. This (for instance) replaces references to typedef - // members of the current instantiations with the definitions of those - // typedefs, avoiding triggering instantiation of the deduced type during - // deduction. - NewDI = ExtractTypeForDeductionGuide( - SemaRef, MaterializedTypedefs, NestedPattern, - TransformingOuterPatterns ? &Args : nullptr) - .transform(NewDI); - - // Resolving a wording defect, we also inherit default arguments from the - // constructor. - ExprResult NewDefArg; - if (OldParam->hasDefaultArg()) { - // We don't care what the value is (we won't use it); just create a - // placeholder to indicate there is a default argument. - QualType ParamTy = NewDI->getType(); - NewDefArg = new (SemaRef.Context) - OpaqueValueExpr(OldParam->getDefaultArgRange().getBegin(), - ParamTy.getNonLValueExprType(SemaRef.Context), - ParamTy->isLValueReferenceType() ? VK_LValue - : ParamTy->isRValueReferenceType() ? VK_XValue - : VK_PRValue); - } - // Handle arrays and functions decay. - auto NewType = NewDI->getType(); - if (NewType->isArrayType() || NewType->isFunctionType()) - NewType = SemaRef.Context.getDecayedType(NewType); - - ParmVarDecl *NewParam = ParmVarDecl::Create( - SemaRef.Context, DC, OldParam->getInnerLocStart(), - OldParam->getLocation(), OldParam->getIdentifier(), NewType, NewDI, - OldParam->getStorageClass(), NewDefArg.get()); - NewParam->setScopeInfo(OldParam->getFunctionScopeDepth(), - OldParam->getFunctionScopeIndex()); - SemaRef.CurrentInstantiationScope->InstantiatedLocal(OldParam, NewParam); - return NewParam; - } -}; - -unsigned getTemplateParameterDepth(NamedDecl *TemplateParam) { - if (auto *TTP = dyn_cast(TemplateParam)) - return TTP->getDepth(); - if (auto *TTP = dyn_cast(TemplateParam)) - return TTP->getDepth(); - if (auto *NTTP = dyn_cast(TemplateParam)) - return NTTP->getDepth(); - llvm_unreachable("Unhandled template parameter types"); -} - -unsigned getTemplateParameterIndex(NamedDecl *TemplateParam) { - if (auto *TTP = dyn_cast(TemplateParam)) - return TTP->getIndex(); - if (auto *TTP = dyn_cast(TemplateParam)) - return TTP->getIndex(); - if (auto *NTTP = dyn_cast(TemplateParam)) - return NTTP->getIndex(); - llvm_unreachable("Unhandled template parameter types"); -} - -// Find all template parameters that appear in the given DeducedArgs. -// Return the indices of the template parameters in the TemplateParams. -SmallVector TemplateParamsReferencedInTemplateArgumentList( - const TemplateParameterList *TemplateParamsList, - ArrayRef DeducedArgs) { - struct TemplateParamsReferencedFinder - : public RecursiveASTVisitor { - const TemplateParameterList *TemplateParamList; - llvm::BitVector ReferencedTemplateParams; - - TemplateParamsReferencedFinder( - const TemplateParameterList *TemplateParamList) - : TemplateParamList(TemplateParamList), - ReferencedTemplateParams(TemplateParamList->size()) {} - - bool VisitTemplateTypeParmType(TemplateTypeParmType *TTP) { - // We use the index and depth to retrieve the corresponding template - // parameter from the parameter list, which is more robost. - Mark(TTP->getDepth(), TTP->getIndex()); - return true; - } - - bool VisitDeclRefExpr(DeclRefExpr *DRE) { - MarkAppeared(DRE->getFoundDecl()); - return true; - } - - bool TraverseTemplateName(TemplateName Template) { - if (auto *TD = Template.getAsTemplateDecl()) - MarkAppeared(TD); - return RecursiveASTVisitor::TraverseTemplateName(Template); - } - - void MarkAppeared(NamedDecl *ND) { - if (llvm::isa(ND)) - Mark(getTemplateParameterDepth(ND), getTemplateParameterIndex(ND)); - } - void Mark(unsigned Depth, unsigned Index) { - if (Index < TemplateParamList->size() && - TemplateParamList->getParam(Index)->getTemplateDepth() == Depth) - ReferencedTemplateParams.set(Index); - } - }; - TemplateParamsReferencedFinder Finder(TemplateParamsList); - Finder.TraverseTemplateArguments(DeducedArgs); - - SmallVector Results; - for (unsigned Index = 0; Index < TemplateParamsList->size(); ++Index) { - if (Finder.ReferencedTemplateParams[Index]) - Results.push_back(Index); - } - return Results; -} - -bool hasDeclaredDeductionGuides(DeclarationName Name, DeclContext *DC) { - // Check whether we've already declared deduction guides for this template. - // FIXME: Consider storing a flag on the template to indicate this. - assert(Name.getNameKind() == - DeclarationName::NameKind::CXXDeductionGuideName && - "name must be a deduction guide name"); - auto Existing = DC->lookup(Name); - for (auto *D : Existing) - if (D->isImplicit()) - return true; - return false; -} - -NamedDecl *transformTemplateParameter(Sema &SemaRef, DeclContext *DC, - NamedDecl *TemplateParam, - MultiLevelTemplateArgumentList &Args, - unsigned NewIndex, unsigned NewDepth) { - if (auto *TTP = dyn_cast(TemplateParam)) - return transformTemplateTypeParam(SemaRef, DC, TTP, Args, NewDepth, - NewIndex); - if (auto *TTP = dyn_cast(TemplateParam)) - return transformTemplateParam(SemaRef, DC, TTP, Args, NewIndex, NewDepth); - if (auto *NTTP = dyn_cast(TemplateParam)) - return transformTemplateParam(SemaRef, DC, NTTP, Args, NewIndex, NewDepth); - llvm_unreachable("Unhandled template parameter types"); -} - -// Build the associated constraints for the alias deduction guides. -// C++ [over.match.class.deduct]p3.3: -// The associated constraints ([temp.constr.decl]) are the conjunction of the -// associated constraints of g and a constraint that is satisfied if and only -// if the arguments of A are deducible (see below) from the return type. -// -// The return result is expected to be the require-clause for the synthesized -// alias deduction guide. -Expr * -buildAssociatedConstraints(Sema &SemaRef, FunctionTemplateDecl *F, - TypeAliasTemplateDecl *AliasTemplate, - ArrayRef DeduceResults, - unsigned FirstUndeducedParamIdx, Expr *IsDeducible) { - Expr *RC = F->getTemplateParameters()->getRequiresClause(); - if (!RC) - return IsDeducible; - - ASTContext &Context = SemaRef.Context; - LocalInstantiationScope Scope(SemaRef); - - // In the clang AST, constraint nodes are deliberately not instantiated unless - // they are actively being evaluated. Consequently, occurrences of template - // parameters in the require-clause expression have a subtle "depth" - // difference compared to normal occurrences in places, such as function - // parameters. When transforming the require-clause, we must take this - // distinction into account: - // - // 1) In the transformed require-clause, occurrences of template parameters - // must use the "uninstantiated" depth; - // 2) When substituting on the require-clause expr of the underlying - // deduction guide, we must use the entire set of template argument lists; - // - // It's important to note that we're performing this transformation on an - // *instantiated* AliasTemplate. - - // For 1), if the alias template is nested within a class template, we - // calcualte the 'uninstantiated' depth by adding the substitution level back. - unsigned AdjustDepth = 0; - if (auto *PrimaryTemplate = - AliasTemplate->getInstantiatedFromMemberTemplate()) - AdjustDepth = PrimaryTemplate->getTemplateDepth(); - - // We rebuild all template parameters with the uninstantiated depth, and - // build template arguments refer to them. - SmallVector AdjustedAliasTemplateArgs; - - for (auto *TP : *AliasTemplate->getTemplateParameters()) { - // Rebuild any internal references to earlier parameters and reindex - // as we go. - MultiLevelTemplateArgumentList Args; - Args.setKind(TemplateSubstitutionKind::Rewrite); - Args.addOuterTemplateArguments(AdjustedAliasTemplateArgs); - NamedDecl *NewParam = transformTemplateParameter( - SemaRef, AliasTemplate->getDeclContext(), TP, Args, - /*NewIndex=*/AdjustedAliasTemplateArgs.size(), - getTemplateParameterDepth(TP) + AdjustDepth); - - auto NewTemplateArgument = Context.getCanonicalTemplateArgument( - Context.getInjectedTemplateArg(NewParam)); - AdjustedAliasTemplateArgs.push_back(NewTemplateArgument); - } - // Template arguments used to transform the template arguments in - // DeducedResults. - SmallVector TemplateArgsForBuildingRC( - F->getTemplateParameters()->size()); - // Transform the transformed template args - MultiLevelTemplateArgumentList Args; - Args.setKind(TemplateSubstitutionKind::Rewrite); - Args.addOuterTemplateArguments(AdjustedAliasTemplateArgs); - - for (unsigned Index = 0; Index < DeduceResults.size(); ++Index) { - const auto &D = DeduceResults[Index]; - if (D.isNull()) { // non-deduced template parameters of f - NamedDecl *TP = F->getTemplateParameters()->getParam(Index); - MultiLevelTemplateArgumentList Args; - Args.setKind(TemplateSubstitutionKind::Rewrite); - Args.addOuterTemplateArguments(TemplateArgsForBuildingRC); - // Rebuild the template parameter with updated depth and index. - NamedDecl *NewParam = transformTemplateParameter( - SemaRef, F->getDeclContext(), TP, Args, - /*NewIndex=*/FirstUndeducedParamIdx, - getTemplateParameterDepth(TP) + AdjustDepth); - FirstUndeducedParamIdx += 1; - assert(TemplateArgsForBuildingRC[Index].isNull()); - TemplateArgsForBuildingRC[Index] = Context.getCanonicalTemplateArgument( - Context.getInjectedTemplateArg(NewParam)); - continue; - } - TemplateArgumentLoc Input = - SemaRef.getTrivialTemplateArgumentLoc(D, QualType(), SourceLocation{}); - TemplateArgumentLoc Output; - if (!SemaRef.SubstTemplateArgument(Input, Args, Output)) { - assert(TemplateArgsForBuildingRC[Index].isNull() && - "InstantiatedArgs must be null before setting"); - TemplateArgsForBuildingRC[Index] = Output.getArgument(); - } - } - - // A list of template arguments for transforming the require-clause of F. - // It must contain the entire set of template argument lists. - MultiLevelTemplateArgumentList ArgsForBuildingRC; - ArgsForBuildingRC.setKind(clang::TemplateSubstitutionKind::Rewrite); - ArgsForBuildingRC.addOuterTemplateArguments(TemplateArgsForBuildingRC); - // For 2), if the underlying deduction guide F is nested in a class template, - // we need the entire template argument list, as the constraint AST in the - // require-clause of F remains completely uninstantiated. - // - // For example: - // template // depth 0 - // struct Outer { - // template - // struct Foo { Foo(U); }; - // - // template // depth 1 - // requires C - // Foo(U) -> Foo; - // }; - // template - // using AFoo = Outer::Foo; - // - // In this scenario, the deduction guide for `Foo` inside `Outer`: - // - The occurrence of U in the require-expression is [depth:1, index:0] - // - The occurrence of U in the function parameter is [depth:0, index:0] - // - The template parameter of U is [depth:0, index:0] - // - // We add the outer template arguments which is [int] to the multi-level arg - // list to ensure that the occurrence U in `C` will be replaced with int - // during the substitution. - // - // NOTE: The underlying deduction guide F is instantiated -- either from an - // explicitly-written deduction guide member, or from a constructor. - // getInstantiatedFromMemberTemplate() can only handle the former case, so we - // check the DeclContext kind. - if (F->getLexicalDeclContext()->getDeclKind() == - clang::Decl::ClassTemplateSpecialization) { - auto OuterLevelArgs = SemaRef.getTemplateInstantiationArgs( - F, F->getLexicalDeclContext(), - /*Final=*/false, /*Innermost=*/std::nullopt, - /*RelativeToPrimary=*/true, - /*Pattern=*/nullptr, - /*ForConstraintInstantiation=*/true); - for (auto It : OuterLevelArgs) - ArgsForBuildingRC.addOuterTemplateArguments(It.Args); - } - - ExprResult E = SemaRef.SubstExpr(RC, ArgsForBuildingRC); - if (E.isInvalid()) - return nullptr; - - auto Conjunction = - SemaRef.BuildBinOp(SemaRef.getCurScope(), SourceLocation{}, - BinaryOperatorKind::BO_LAnd, E.get(), IsDeducible); - if (Conjunction.isInvalid()) - return nullptr; - return Conjunction.getAs(); -} -// Build the is_deducible constraint for the alias deduction guides. -// [over.match.class.deduct]p3.3: -// ... and a constraint that is satisfied if and only if the arguments -// of A are deducible (see below) from the return type. -Expr *buildIsDeducibleConstraint(Sema &SemaRef, - TypeAliasTemplateDecl *AliasTemplate, - QualType ReturnType, - SmallVector TemplateParams) { - ASTContext &Context = SemaRef.Context; - // Constraint AST nodes must use uninstantiated depth. - if (auto *PrimaryTemplate = - AliasTemplate->getInstantiatedFromMemberTemplate(); - PrimaryTemplate && TemplateParams.size() > 0) { - LocalInstantiationScope Scope(SemaRef); - - // Adjust the depth for TemplateParams. - unsigned AdjustDepth = PrimaryTemplate->getTemplateDepth(); - SmallVector TransformedTemplateArgs; - for (auto *TP : TemplateParams) { - // Rebuild any internal references to earlier parameters and reindex - // as we go. - MultiLevelTemplateArgumentList Args; - Args.setKind(TemplateSubstitutionKind::Rewrite); - Args.addOuterTemplateArguments(TransformedTemplateArgs); - NamedDecl *NewParam = transformTemplateParameter( - SemaRef, AliasTemplate->getDeclContext(), TP, Args, - /*NewIndex=*/TransformedTemplateArgs.size(), - getTemplateParameterDepth(TP) + AdjustDepth); - - auto NewTemplateArgument = Context.getCanonicalTemplateArgument( - Context.getInjectedTemplateArg(NewParam)); - TransformedTemplateArgs.push_back(NewTemplateArgument); - } - // Transformed the ReturnType to restore the uninstantiated depth. - MultiLevelTemplateArgumentList Args; - Args.setKind(TemplateSubstitutionKind::Rewrite); - Args.addOuterTemplateArguments(TransformedTemplateArgs); - ReturnType = SemaRef.SubstType( - ReturnType, Args, AliasTemplate->getLocation(), - Context.DeclarationNames.getCXXDeductionGuideName(AliasTemplate)); - }; - - SmallVector IsDeducibleTypeTraitArgs = { - Context.getTrivialTypeSourceInfo( - Context.getDeducedTemplateSpecializationType( - TemplateName(AliasTemplate), /*DeducedType=*/QualType(), - /*IsDependent=*/true)), // template specialization type whose - // arguments will be deduced. - Context.getTrivialTypeSourceInfo( - ReturnType), // type from which template arguments are deduced. - }; - return TypeTraitExpr::Create( - Context, Context.getLogicalOperationType(), AliasTemplate->getLocation(), - TypeTrait::BTT_IsDeducible, IsDeducibleTypeTraitArgs, - AliasTemplate->getLocation(), /*Value*/ false); -} - -std::pair> -getRHSTemplateDeclAndArgs(Sema &SemaRef, TypeAliasTemplateDecl *AliasTemplate) { - // Unwrap the sugared ElaboratedType. - auto RhsType = AliasTemplate->getTemplatedDecl() - ->getUnderlyingType() - .getSingleStepDesugaredType(SemaRef.Context); - TemplateDecl *Template = nullptr; - llvm::ArrayRef AliasRhsTemplateArgs; - if (const auto *TST = RhsType->getAs()) { - // Cases where the RHS of the alias is dependent. e.g. - // template - // using AliasFoo1 = Foo; // a class/type alias template specialization - Template = TST->getTemplateName().getAsTemplateDecl(); - AliasRhsTemplateArgs = TST->template_arguments(); - } else if (const auto *RT = RhsType->getAs()) { - // Cases where template arguments in the RHS of the alias are not - // dependent. e.g. - // using AliasFoo = Foo; - if (const auto *CTSD = llvm::dyn_cast( - RT->getAsCXXRecordDecl())) { - Template = CTSD->getSpecializedTemplate(); - AliasRhsTemplateArgs = CTSD->getTemplateArgs().asArray(); - } - } else { - assert(false && "unhandled RHS type of the alias"); - } - return {Template, AliasRhsTemplateArgs}; -} - -// Build deduction guides for a type alias template from the given underlying -// deduction guide F. -FunctionTemplateDecl * -BuildDeductionGuideForTypeAlias(Sema &SemaRef, - TypeAliasTemplateDecl *AliasTemplate, - FunctionTemplateDecl *F, SourceLocation Loc) { - LocalInstantiationScope Scope(SemaRef); - Sema::InstantiatingTemplate BuildingDeductionGuides( - SemaRef, AliasTemplate->getLocation(), F, - Sema::InstantiatingTemplate::BuildingDeductionGuidesTag{}); - if (BuildingDeductionGuides.isInvalid()) - return nullptr; - - auto &Context = SemaRef.Context; - auto [Template, AliasRhsTemplateArgs] = - getRHSTemplateDeclAndArgs(SemaRef, AliasTemplate); - - auto RType = F->getTemplatedDecl()->getReturnType(); - // The (trailing) return type of the deduction guide. - const TemplateSpecializationType *FReturnType = - RType->getAs(); - if (const auto *InjectedCNT = RType->getAs()) - // implicitly-generated deduction guide. - FReturnType = InjectedCNT->getInjectedTST(); - else if (const auto *ET = RType->getAs()) - // explicit deduction guide. - FReturnType = ET->getNamedType()->getAs(); - assert(FReturnType && "expected to see a return type"); - // Deduce template arguments of the deduction guide f from the RHS of - // the alias. - // - // C++ [over.match.class.deduct]p3: ...For each function or function - // template f in the guides of the template named by the - // simple-template-id of the defining-type-id, the template arguments - // of the return type of f are deduced from the defining-type-id of A - // according to the process in [temp.deduct.type] with the exception - // that deduction does not fail if not all template arguments are - // deduced. - // - // - // template - // f(X, Y) -> f; - // - // template - // using alias = f; - // - // The RHS of alias is f, we deduced the template arguments of - // the return type of the deduction guide from it: Y->int, X->U - sema::TemplateDeductionInfo TDeduceInfo(Loc); - // Must initialize n elements, this is required by DeduceTemplateArguments. - SmallVector DeduceResults( - F->getTemplateParameters()->size()); - - // FIXME: DeduceTemplateArguments stops immediately at the first - // non-deducible template argument. However, this doesn't seem to casue - // issues for practice cases, we probably need to extend it to continue - // performing deduction for rest of arguments to align with the C++ - // standard. - SemaRef.DeduceTemplateArguments( - F->getTemplateParameters(), FReturnType->template_arguments(), - AliasRhsTemplateArgs, TDeduceInfo, DeduceResults, - /*NumberOfArgumentsMustMatch=*/false); - - SmallVector DeducedArgs; - SmallVector NonDeducedTemplateParamsInFIndex; - // !!NOTE: DeduceResults respects the sequence of template parameters of - // the deduction guide f. - for (unsigned Index = 0; Index < DeduceResults.size(); ++Index) { - if (const auto &D = DeduceResults[Index]; !D.isNull()) // Deduced - DeducedArgs.push_back(D); - else - NonDeducedTemplateParamsInFIndex.push_back(Index); - } - auto DeducedAliasTemplateParams = - TemplateParamsReferencedInTemplateArgumentList( - AliasTemplate->getTemplateParameters(), DeducedArgs); - // All template arguments null by default. - SmallVector TemplateArgsForBuildingFPrime( - F->getTemplateParameters()->size()); - - // Create a template parameter list for the synthesized deduction guide f'. - // - // C++ [over.match.class.deduct]p3.2: - // If f is a function template, f' is a function template whose template - // parameter list consists of all the template parameters of A - // (including their default template arguments) that appear in the above - // deductions or (recursively) in their default template arguments - SmallVector FPrimeTemplateParams; - // Store template arguments that refer to the newly-created template - // parameters, used for building `TemplateArgsForBuildingFPrime`. - SmallVector TransformedDeducedAliasArgs( - AliasTemplate->getTemplateParameters()->size()); - - for (unsigned AliasTemplateParamIdx : DeducedAliasTemplateParams) { - auto *TP = - AliasTemplate->getTemplateParameters()->getParam(AliasTemplateParamIdx); - // Rebuild any internal references to earlier parameters and reindex as - // we go. - MultiLevelTemplateArgumentList Args; - Args.setKind(TemplateSubstitutionKind::Rewrite); - Args.addOuterTemplateArguments(TransformedDeducedAliasArgs); - NamedDecl *NewParam = transformTemplateParameter( - SemaRef, AliasTemplate->getDeclContext(), TP, Args, - /*NewIndex=*/FPrimeTemplateParams.size(), - getTemplateParameterDepth(TP)); - FPrimeTemplateParams.push_back(NewParam); - - auto NewTemplateArgument = Context.getCanonicalTemplateArgument( - Context.getInjectedTemplateArg(NewParam)); - TransformedDeducedAliasArgs[AliasTemplateParamIdx] = NewTemplateArgument; - } - unsigned FirstUndeducedParamIdx = FPrimeTemplateParams.size(); - // ...followed by the template parameters of f that were not deduced - // (including their default template arguments) - for (unsigned FTemplateParamIdx : NonDeducedTemplateParamsInFIndex) { - auto *TP = F->getTemplateParameters()->getParam(FTemplateParamIdx); - MultiLevelTemplateArgumentList Args; - Args.setKind(TemplateSubstitutionKind::Rewrite); - // We take a shortcut here, it is ok to reuse the - // TemplateArgsForBuildingFPrime. - Args.addOuterTemplateArguments(TemplateArgsForBuildingFPrime); - NamedDecl *NewParam = transformTemplateParameter( - SemaRef, F->getDeclContext(), TP, Args, FPrimeTemplateParams.size(), - getTemplateParameterDepth(TP)); - FPrimeTemplateParams.push_back(NewParam); - - assert(TemplateArgsForBuildingFPrime[FTemplateParamIdx].isNull() && - "The argument must be null before setting"); - TemplateArgsForBuildingFPrime[FTemplateParamIdx] = - Context.getCanonicalTemplateArgument( - Context.getInjectedTemplateArg(NewParam)); - } - - // To form a deduction guide f' from f, we leverage clang's instantiation - // mechanism, we construct a template argument list where the template - // arguments refer to the newly-created template parameters of f', and - // then apply instantiation on this template argument list to instantiate - // f, this ensures all template parameter occurrences are updated - // correctly. - // - // The template argument list is formed from the `DeducedArgs`, two parts: - // 1) appeared template parameters of alias: transfrom the deduced - // template argument; - // 2) non-deduced template parameters of f: rebuild a - // template argument; - // - // 2) has been built already (when rebuilding the new template - // parameters), we now perform 1). - MultiLevelTemplateArgumentList Args; - Args.setKind(TemplateSubstitutionKind::Rewrite); - Args.addOuterTemplateArguments(TransformedDeducedAliasArgs); - for (unsigned Index = 0; Index < DeduceResults.size(); ++Index) { - const auto &D = DeduceResults[Index]; - if (D.isNull()) { - // 2): Non-deduced template parameter has been built already. - assert(!TemplateArgsForBuildingFPrime[Index].isNull() && - "template arguments for non-deduced template parameters should " - "be been set!"); - continue; - } - TemplateArgumentLoc Input = - SemaRef.getTrivialTemplateArgumentLoc(D, QualType(), SourceLocation{}); - TemplateArgumentLoc Output; - if (!SemaRef.SubstTemplateArgument(Input, Args, Output)) { - assert(TemplateArgsForBuildingFPrime[Index].isNull() && - "InstantiatedArgs must be null before setting"); - TemplateArgsForBuildingFPrime[Index] = Output.getArgument(); - } - } - - auto *TemplateArgListForBuildingFPrime = - TemplateArgumentList::CreateCopy(Context, TemplateArgsForBuildingFPrime); - // Form the f' by substituting the template arguments into f. - if (auto *FPrime = SemaRef.InstantiateFunctionDeclaration( - F, TemplateArgListForBuildingFPrime, AliasTemplate->getLocation(), - Sema::CodeSynthesisContext::BuildingDeductionGuides)) { - auto *GG = cast(FPrime); - - Expr *IsDeducible = buildIsDeducibleConstraint( - SemaRef, AliasTemplate, FPrime->getReturnType(), FPrimeTemplateParams); - Expr *RequiresClause = - buildAssociatedConstraints(SemaRef, F, AliasTemplate, DeduceResults, - FirstUndeducedParamIdx, IsDeducible); - - auto *FPrimeTemplateParamList = TemplateParameterList::Create( - Context, AliasTemplate->getTemplateParameters()->getTemplateLoc(), - AliasTemplate->getTemplateParameters()->getLAngleLoc(), - FPrimeTemplateParams, - AliasTemplate->getTemplateParameters()->getRAngleLoc(), - /*RequiresClause=*/RequiresClause); - auto *Result = cast(buildDeductionGuide( - SemaRef, AliasTemplate, FPrimeTemplateParamList, - GG->getCorrespondingConstructor(), GG->getExplicitSpecifier(), - GG->getTypeSourceInfo(), AliasTemplate->getBeginLoc(), - AliasTemplate->getLocation(), AliasTemplate->getEndLoc(), - F->isImplicit())); - cast(Result->getTemplatedDecl()) - ->setDeductionCandidateKind(GG->getDeductionCandidateKind()); - return Result; - } - return nullptr; -} - -void DeclareImplicitDeductionGuidesForTypeAlias( - Sema &SemaRef, TypeAliasTemplateDecl *AliasTemplate, SourceLocation Loc) { - if (AliasTemplate->isInvalidDecl()) - return; - auto &Context = SemaRef.Context; - // FIXME: if there is an explicit deduction guide after the first use of the - // type alias usage, we will not cover this explicit deduction guide. fix this - // case. - if (hasDeclaredDeductionGuides( - Context.DeclarationNames.getCXXDeductionGuideName(AliasTemplate), - AliasTemplate->getDeclContext())) - return; - auto [Template, AliasRhsTemplateArgs] = - getRHSTemplateDeclAndArgs(SemaRef, AliasTemplate); - if (!Template) - return; - DeclarationNameInfo NameInfo( - Context.DeclarationNames.getCXXDeductionGuideName(Template), Loc); - LookupResult Guides(SemaRef, NameInfo, clang::Sema::LookupOrdinaryName); - SemaRef.LookupQualifiedName(Guides, Template->getDeclContext()); - Guides.suppressDiagnostics(); - - for (auto *G : Guides) { - if (auto *DG = dyn_cast(G)) { - // The deduction guide is a non-template function decl, we just clone it. - auto *FunctionType = - SemaRef.Context.getTrivialTypeSourceInfo(DG->getType()); - FunctionProtoTypeLoc FPTL = - FunctionType->getTypeLoc().castAs(); - - // Clone the parameters. - for (unsigned I = 0, N = DG->getNumParams(); I != N; ++I) { - const auto *P = DG->getParamDecl(I); - auto *TSI = SemaRef.Context.getTrivialTypeSourceInfo(P->getType()); - ParmVarDecl *NewParam = ParmVarDecl::Create( - SemaRef.Context, G->getDeclContext(), - DG->getParamDecl(I)->getBeginLoc(), P->getLocation(), nullptr, - TSI->getType(), TSI, SC_None, nullptr); - NewParam->setScopeInfo(0, I); - FPTL.setParam(I, NewParam); - } - auto *Transformed = cast(buildDeductionGuide( - SemaRef, AliasTemplate, /*TemplateParams=*/nullptr, - /*Constructor=*/nullptr, DG->getExplicitSpecifier(), FunctionType, - AliasTemplate->getBeginLoc(), AliasTemplate->getLocation(), - AliasTemplate->getEndLoc(), DG->isImplicit())); - - // FIXME: Here the synthesized deduction guide is not a templated - // function. Per [dcl.decl]p4, the requires-clause shall be present only - // if the declarator declares a templated function, a bug in standard? - auto *Constraint = buildIsDeducibleConstraint( - SemaRef, AliasTemplate, Transformed->getReturnType(), {}); - if (auto *RC = DG->getTrailingRequiresClause()) { - auto Conjunction = - SemaRef.BuildBinOp(SemaRef.getCurScope(), SourceLocation{}, - BinaryOperatorKind::BO_LAnd, RC, Constraint); - if (!Conjunction.isInvalid()) - Constraint = Conjunction.getAs(); - } - Transformed->setTrailingRequiresClause(Constraint); - } - FunctionTemplateDecl *F = dyn_cast(G); - if (!F) - continue; - // The **aggregate** deduction guides are handled in a different code path - // (DeclareAggregateDeductionGuideFromInitList), which involves the tricky - // cache. - if (cast(F->getTemplatedDecl()) - ->getDeductionCandidateKind() == DeductionCandidate::Aggregate) - continue; - - BuildDeductionGuideForTypeAlias(SemaRef, AliasTemplate, F, Loc); - } -} - -// Build an aggregate deduction guide for a type alias template. -FunctionTemplateDecl *DeclareAggregateDeductionGuideForTypeAlias( - Sema &SemaRef, TypeAliasTemplateDecl *AliasTemplate, - MutableArrayRef ParamTypes, SourceLocation Loc) { - TemplateDecl *RHSTemplate = - getRHSTemplateDeclAndArgs(SemaRef, AliasTemplate).first; - if (!RHSTemplate) - return nullptr; - auto *RHSDeductionGuide = SemaRef.DeclareAggregateDeductionGuideFromInitList( - RHSTemplate, ParamTypes, Loc); - if (!RHSDeductionGuide) - return nullptr; - return BuildDeductionGuideForTypeAlias(SemaRef, AliasTemplate, - RHSDeductionGuide, Loc); -} - -} // namespace - -FunctionTemplateDecl *Sema::DeclareAggregateDeductionGuideFromInitList( - TemplateDecl *Template, MutableArrayRef ParamTypes, - SourceLocation Loc) { - llvm::FoldingSetNodeID ID; - ID.AddPointer(Template); - for (auto &T : ParamTypes) - T.getCanonicalType().Profile(ID); - unsigned Hash = ID.ComputeHash(); - - auto Found = AggregateDeductionCandidates.find(Hash); - if (Found != AggregateDeductionCandidates.end()) { - CXXDeductionGuideDecl *GD = Found->getSecond(); - return GD->getDescribedFunctionTemplate(); - } - - if (auto *AliasTemplate = llvm::dyn_cast(Template)) { - if (auto *FTD = DeclareAggregateDeductionGuideForTypeAlias( - *this, AliasTemplate, ParamTypes, Loc)) { - auto *GD = cast(FTD->getTemplatedDecl()); - GD->setDeductionCandidateKind(DeductionCandidate::Aggregate); - AggregateDeductionCandidates[Hash] = GD; - return FTD; - } - } - - if (CXXRecordDecl *DefRecord = - cast(Template->getTemplatedDecl())->getDefinition()) { - if (TemplateDecl *DescribedTemplate = - DefRecord->getDescribedClassTemplate()) - Template = DescribedTemplate; - } - - DeclContext *DC = Template->getDeclContext(); - if (DC->isDependentContext()) - return nullptr; - - ConvertConstructorToDeductionGuideTransform Transform( - *this, cast(Template)); - if (!isCompleteType(Loc, Transform.DeducedType)) - return nullptr; - - // In case we were expanding a pack when we attempted to declare deduction - // guides, turn off pack expansion for everything we're about to do. - ArgumentPackSubstitutionIndexRAII SubstIndex(*this, - /*NewSubstitutionIndex=*/-1); - // Create a template instantiation record to track the "instantiation" of - // constructors into deduction guides. - InstantiatingTemplate BuildingDeductionGuides( - *this, Loc, Template, - Sema::InstantiatingTemplate::BuildingDeductionGuidesTag{}); - if (BuildingDeductionGuides.isInvalid()) - return nullptr; - - ClassTemplateDecl *Pattern = - Transform.NestedPattern ? Transform.NestedPattern : Transform.Template; - ContextRAII SavedContext(*this, Pattern->getTemplatedDecl()); - - auto *FTD = cast( - Transform.buildSimpleDeductionGuide(ParamTypes)); - SavedContext.pop(); - auto *GD = cast(FTD->getTemplatedDecl()); - GD->setDeductionCandidateKind(DeductionCandidate::Aggregate); - AggregateDeductionCandidates[Hash] = GD; - return FTD; -} - -void Sema::DeclareImplicitDeductionGuides(TemplateDecl *Template, - SourceLocation Loc) { - if (auto *AliasTemplate = llvm::dyn_cast(Template)) { - DeclareImplicitDeductionGuidesForTypeAlias(*this, AliasTemplate, Loc); - return; - } - if (CXXRecordDecl *DefRecord = - cast(Template->getTemplatedDecl())->getDefinition()) { - if (TemplateDecl *DescribedTemplate = DefRecord->getDescribedClassTemplate()) - Template = DescribedTemplate; - } - - DeclContext *DC = Template->getDeclContext(); - if (DC->isDependentContext()) - return; - - ConvertConstructorToDeductionGuideTransform Transform( - *this, cast(Template)); - if (!isCompleteType(Loc, Transform.DeducedType)) - return; - - if (hasDeclaredDeductionGuides(Transform.DeductionGuideName, DC)) - return; - - // In case we were expanding a pack when we attempted to declare deduction - // guides, turn off pack expansion for everything we're about to do. - ArgumentPackSubstitutionIndexRAII SubstIndex(*this, -1); - // Create a template instantiation record to track the "instantiation" of - // constructors into deduction guides. - InstantiatingTemplate BuildingDeductionGuides( - *this, Loc, Template, - Sema::InstantiatingTemplate::BuildingDeductionGuidesTag{}); - if (BuildingDeductionGuides.isInvalid()) - return; - - // Convert declared constructors into deduction guide templates. - // FIXME: Skip constructors for which deduction must necessarily fail (those - // for which some class template parameter without a default argument never - // appears in a deduced context). - ClassTemplateDecl *Pattern = - Transform.NestedPattern ? Transform.NestedPattern : Transform.Template; - ContextRAII SavedContext(*this, Pattern->getTemplatedDecl()); - llvm::SmallPtrSet ProcessedCtors; - bool AddedAny = false; - for (NamedDecl *D : LookupConstructors(Pattern->getTemplatedDecl())) { - D = D->getUnderlyingDecl(); - if (D->isInvalidDecl() || D->isImplicit()) - continue; - - D = cast(D->getCanonicalDecl()); - - // Within C++20 modules, we may have multiple same constructors in - // multiple same RecordDecls. And it doesn't make sense to create - // duplicated deduction guides for the duplicated constructors. - if (ProcessedCtors.count(D)) - continue; - - auto *FTD = dyn_cast(D); - auto *CD = - dyn_cast_or_null(FTD ? FTD->getTemplatedDecl() : D); - // Class-scope explicit specializations (MS extension) do not result in - // deduction guides. - if (!CD || (!FTD && CD->isFunctionTemplateSpecialization())) - continue; - - // Cannot make a deduction guide when unparsed arguments are present. - if (llvm::any_of(CD->parameters(), [](ParmVarDecl *P) { - return !P || P->hasUnparsedDefaultArg(); - })) - continue; - - ProcessedCtors.insert(D); - Transform.transformConstructor(FTD, CD); - AddedAny = true; - } - - // C++17 [over.match.class.deduct] - // -- If C is not defined or does not declare any constructors, an - // additional function template derived as above from a hypothetical - // constructor C(). - if (!AddedAny) - Transform.buildSimpleDeductionGuide(std::nullopt); - - // -- An additional function template derived as above from a hypothetical - // constructor C(C), called the copy deduction candidate. - cast( - cast( - Transform.buildSimpleDeductionGuide(Transform.DeducedType)) - ->getTemplatedDecl()) - ->setDeductionCandidateKind(DeductionCandidate::Copy); - - SavedContext.pop(); -} - /// Diagnose the presence of a default template argument on a /// template parameter, which is ill-formed in certain contexts. /// @@ -5849,10 +4476,14 @@ ExprResult Sema::BuildQualifiedTemplateIdExpr( return BuildTemplateIdExpr(SS, TemplateKWLoc, R, /*ADL=*/false, TemplateArgs); } -TemplateNameKind Sema::ActOnTemplateName( - Scope *S, CXXScopeSpec &SS, SourceLocation TemplateKWLoc, - const UnqualifiedId &Name, ParsedType ObjectType, bool EnteringContext, - TemplateTy &Result, bool AllowInjectedClassName, bool MayBeNNS) { +TemplateNameKind Sema::ActOnTemplateName(Scope *S, + CXXScopeSpec &SS, + SourceLocation TemplateKWLoc, + const UnqualifiedId &Name, + ParsedType ObjectType, + bool EnteringContext, + TemplateTy &Result, + bool AllowInjectedClassName) { if (TemplateKWLoc.isValid() && S && !S->getTemplateParamParent()) Diag(TemplateKWLoc, getLangOpts().CPlusPlus11 ? @@ -5887,10 +4518,9 @@ TemplateNameKind Sema::ActOnTemplateName( // "template" keyword is now permitted). We follow the C++0x // rules, even in C++03 mode with a warning, retroactively applying the DR. bool MemberOfUnknownSpecialization; - TemplateNameKind TNK = - isTemplateName(S, SS, TemplateKWLoc.isValid(), Name, ObjectType, - EnteringContext, Result, MemberOfUnknownSpecialization, - /*Disambiguation=*/false, MayBeNNS); + TemplateNameKind TNK = isTemplateName(S, SS, TemplateKWLoc.isValid(), Name, + ObjectType, EnteringContext, Result, + MemberOfUnknownSpecialization); if (TNK != TNK_Non_template) { // We resolved this to a (non-dependent) template name. Return it. auto *LookupRD = dyn_cast_or_null(LookupCtx); @@ -5929,8 +4559,7 @@ TemplateNameKind Sema::ActOnTemplateName( ? RequiredTemplateKind(TemplateKWLoc) : TemplateNameIsRequired; if (!LookupTemplateName(R, S, SS, ObjectType.get(), EnteringContext, RTK, - /*ATK=*/nullptr, /*AllowTypoCorrection=*/false, - MayBeNNS) && + /*ATK=*/nullptr, /*AllowTypoCorrection=*/false) && !R.isAmbiguous()) { if (LookupCtx) Diag(Name.getBeginLoc(), diag::err_no_member) diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp new file mode 100644 index 00000000000000..7dff2c8f985895 --- /dev/null +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -0,0 +1,1438 @@ +//===- SemaTemplateDeductionGude.cpp - Template Argument Deduction---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements deduction guides for C++ class template argument +// deduction. +// +//===----------------------------------------------------------------------===// + +#include "TreeTransform.h" +#include "TypeLocBuilder.h" +#include "clang/AST/ASTConsumer.h" +#include "clang/AST/ASTContext.h" +#include "clang/AST/Decl.h" +#include "clang/AST/DeclBase.h" +#include "clang/AST/DeclCXX.h" +#include "clang/AST/DeclFriend.h" +#include "clang/AST/DeclTemplate.h" +#include "clang/AST/DeclarationName.h" +#include "clang/AST/Expr.h" +#include "clang/AST/ExprCXX.h" +#include "clang/AST/OperationKinds.h" +#include "clang/AST/RecursiveASTVisitor.h" +#include "clang/AST/TemplateBase.h" +#include "clang/AST/TemplateName.h" +#include "clang/AST/Type.h" +#include "clang/AST/TypeLoc.h" +#include "clang/Basic/LLVM.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/Specifiers.h" +#include "clang/Basic/TypeTraits.h" +#include "clang/Sema/DeclSpec.h" +#include "clang/Sema/Initialization.h" +#include "clang/Sema/Lookup.h" +#include "clang/Sema/Overload.h" +#include "clang/Sema/Ownership.h" +#include "clang/Sema/Scope.h" +#include "clang/Sema/Template.h" +#include "clang/Sema/TemplateDeduction.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include +#include +#include + +using namespace clang; +using namespace sema; + +namespace { +/// Tree transform to "extract" a transformed type from a class template's +/// constructor to a deduction guide. +class ExtractTypeForDeductionGuide + : public TreeTransform { + llvm::SmallVectorImpl &MaterializedTypedefs; + ClassTemplateDecl *NestedPattern; + const MultiLevelTemplateArgumentList *OuterInstantiationArgs; + std::optional TypedefNameInstantiator; + +public: + typedef TreeTransform Base; + ExtractTypeForDeductionGuide( + Sema &SemaRef, + llvm::SmallVectorImpl &MaterializedTypedefs, + ClassTemplateDecl *NestedPattern, + const MultiLevelTemplateArgumentList *OuterInstantiationArgs) + : Base(SemaRef), MaterializedTypedefs(MaterializedTypedefs), + NestedPattern(NestedPattern), + OuterInstantiationArgs(OuterInstantiationArgs) { + if (OuterInstantiationArgs) + TypedefNameInstantiator.emplace( + SemaRef, SemaRef.getASTContext().getTranslationUnitDecl(), + *OuterInstantiationArgs); + } + + TypeSourceInfo *transform(TypeSourceInfo *TSI) { return TransformType(TSI); } + + /// Returns true if it's safe to substitute \p Typedef with + /// \p OuterInstantiationArgs. + bool mightReferToOuterTemplateParameters(TypedefNameDecl *Typedef) { + if (!NestedPattern) + return false; + + static auto WalkUp = [](DeclContext *DC, DeclContext *TargetDC) { + if (DC->Equals(TargetDC)) + return true; + while (DC->isRecord()) { + if (DC->Equals(TargetDC)) + return true; + DC = DC->getParent(); + } + return false; + }; + + if (WalkUp(Typedef->getDeclContext(), NestedPattern->getTemplatedDecl())) + return true; + if (WalkUp(NestedPattern->getTemplatedDecl(), Typedef->getDeclContext())) + return true; + return false; + } + + QualType + RebuildTemplateSpecializationType(TemplateName Template, + SourceLocation TemplateNameLoc, + TemplateArgumentListInfo &TemplateArgs) { + if (!OuterInstantiationArgs || + !isa_and_present(Template.getAsTemplateDecl())) + return Base::RebuildTemplateSpecializationType(Template, TemplateNameLoc, + TemplateArgs); + + auto *TATD = cast(Template.getAsTemplateDecl()); + auto *Pattern = TATD; + while (Pattern->getInstantiatedFromMemberTemplate()) + Pattern = Pattern->getInstantiatedFromMemberTemplate(); + if (!mightReferToOuterTemplateParameters(Pattern->getTemplatedDecl())) + return Base::RebuildTemplateSpecializationType(Template, TemplateNameLoc, + TemplateArgs); + + Decl *NewD = + TypedefNameInstantiator->InstantiateTypeAliasTemplateDecl(TATD); + if (!NewD) + return QualType(); + + auto *NewTATD = cast(NewD); + MaterializedTypedefs.push_back(NewTATD->getTemplatedDecl()); + + return Base::RebuildTemplateSpecializationType( + TemplateName(NewTATD), TemplateNameLoc, TemplateArgs); + } + + QualType TransformTypedefType(TypeLocBuilder &TLB, TypedefTypeLoc TL) { + ASTContext &Context = SemaRef.getASTContext(); + TypedefNameDecl *OrigDecl = TL.getTypedefNameDecl(); + TypedefNameDecl *Decl = OrigDecl; + // Transform the underlying type of the typedef and clone the Decl only if + // the typedef has a dependent context. + bool InDependentContext = OrigDecl->getDeclContext()->isDependentContext(); + + // A typedef/alias Decl within the NestedPattern may reference the outer + // template parameters. They're substituted with corresponding instantiation + // arguments here and in RebuildTemplateSpecializationType() above. + // Otherwise, we would have a CTAD guide with "dangling" template + // parameters. + // For example, + // template struct Outer { + // using Alias = S; + // template struct Inner { + // Inner(Alias); + // }; + // }; + if (OuterInstantiationArgs && InDependentContext && + TL.getTypePtr()->isInstantiationDependentType()) { + Decl = cast_if_present( + TypedefNameInstantiator->InstantiateTypedefNameDecl( + OrigDecl, /*IsTypeAlias=*/isa(OrigDecl))); + if (!Decl) + return QualType(); + MaterializedTypedefs.push_back(Decl); + } else if (InDependentContext) { + TypeLocBuilder InnerTLB; + QualType Transformed = + TransformType(InnerTLB, OrigDecl->getTypeSourceInfo()->getTypeLoc()); + TypeSourceInfo *TSI = InnerTLB.getTypeSourceInfo(Context, Transformed); + if (isa(OrigDecl)) + Decl = TypeAliasDecl::Create( + Context, Context.getTranslationUnitDecl(), OrigDecl->getBeginLoc(), + OrigDecl->getLocation(), OrigDecl->getIdentifier(), TSI); + else { + assert(isa(OrigDecl) && "Not a Type alias or typedef"); + Decl = TypedefDecl::Create( + Context, Context.getTranslationUnitDecl(), OrigDecl->getBeginLoc(), + OrigDecl->getLocation(), OrigDecl->getIdentifier(), TSI); + } + MaterializedTypedefs.push_back(Decl); + } + + QualType TDTy = Context.getTypedefType(Decl); + TypedefTypeLoc TypedefTL = TLB.push(TDTy); + TypedefTL.setNameLoc(TL.getNameLoc()); + + return TDTy; + } +}; + +// Build a deduction guide using the provided information. +// +// A deduction guide can be either a template or a non-template function +// declaration. If \p TemplateParams is null, a non-template function +// declaration will be created. +NamedDecl *buildDeductionGuide( + Sema &SemaRef, TemplateDecl *OriginalTemplate, + TemplateParameterList *TemplateParams, CXXConstructorDecl *Ctor, + ExplicitSpecifier ES, TypeSourceInfo *TInfo, SourceLocation LocStart, + SourceLocation Loc, SourceLocation LocEnd, bool IsImplicit, + llvm::ArrayRef MaterializedTypedefs = {}) { + DeclContext *DC = OriginalTemplate->getDeclContext(); + auto DeductionGuideName = + SemaRef.Context.DeclarationNames.getCXXDeductionGuideName( + OriginalTemplate); + + DeclarationNameInfo Name(DeductionGuideName, Loc); + ArrayRef Params = + TInfo->getTypeLoc().castAs().getParams(); + + // Build the implicit deduction guide template. + auto *Guide = + CXXDeductionGuideDecl::Create(SemaRef.Context, DC, LocStart, ES, Name, + TInfo->getType(), TInfo, LocEnd, Ctor); + Guide->setImplicit(IsImplicit); + Guide->setParams(Params); + + for (auto *Param : Params) + Param->setDeclContext(Guide); + for (auto *TD : MaterializedTypedefs) + TD->setDeclContext(Guide); + if (isa(DC)) + Guide->setAccess(AS_public); + + if (!TemplateParams) { + DC->addDecl(Guide); + return Guide; + } + + auto *GuideTemplate = FunctionTemplateDecl::Create( + SemaRef.Context, DC, Loc, DeductionGuideName, TemplateParams, Guide); + GuideTemplate->setImplicit(IsImplicit); + Guide->setDescribedFunctionTemplate(GuideTemplate); + + if (isa(DC)) + GuideTemplate->setAccess(AS_public); + + DC->addDecl(GuideTemplate); + return GuideTemplate; +} + +// Transform a given template type parameter `TTP`. +TemplateTypeParmDecl * +transformTemplateTypeParam(Sema &SemaRef, DeclContext *DC, + TemplateTypeParmDecl *TTP, + MultiLevelTemplateArgumentList &Args, + unsigned NewDepth, unsigned NewIndex) { + // TemplateTypeParmDecl's index cannot be changed after creation, so + // substitute it directly. + auto *NewTTP = TemplateTypeParmDecl::Create( + SemaRef.Context, DC, TTP->getBeginLoc(), TTP->getLocation(), NewDepth, + NewIndex, TTP->getIdentifier(), TTP->wasDeclaredWithTypename(), + TTP->isParameterPack(), TTP->hasTypeConstraint(), + TTP->isExpandedParameterPack() + ? std::optional(TTP->getNumExpansionParameters()) + : std::nullopt); + if (const auto *TC = TTP->getTypeConstraint()) + SemaRef.SubstTypeConstraint(NewTTP, TC, Args, + /*EvaluateConstraint=*/true); + if (TTP->hasDefaultArgument()) { + TemplateArgumentLoc InstantiatedDefaultArg; + if (!SemaRef.SubstTemplateArgument( + TTP->getDefaultArgument(), Args, InstantiatedDefaultArg, + TTP->getDefaultArgumentLoc(), TTP->getDeclName())) + NewTTP->setDefaultArgument(SemaRef.Context, InstantiatedDefaultArg); + } + SemaRef.CurrentInstantiationScope->InstantiatedLocal(TTP, NewTTP); + return NewTTP; +} +// Similar to above, but for non-type template or template template parameters. +template +NonTypeTemplateOrTemplateTemplateParmDecl * +transformTemplateParam(Sema &SemaRef, DeclContext *DC, + NonTypeTemplateOrTemplateTemplateParmDecl *OldParam, + MultiLevelTemplateArgumentList &Args, unsigned NewIndex, + unsigned NewDepth) { + // Ask the template instantiator to do the heavy lifting for us, then adjust + // the index of the parameter once it's done. + auto *NewParam = cast( + SemaRef.SubstDecl(OldParam, DC, Args)); + NewParam->setPosition(NewIndex); + NewParam->setDepth(NewDepth); + return NewParam; +} + +/// Transform to convert portions of a constructor declaration into the +/// corresponding deduction guide, per C++1z [over.match.class.deduct]p1. +struct ConvertConstructorToDeductionGuideTransform { + ConvertConstructorToDeductionGuideTransform(Sema &S, + ClassTemplateDecl *Template) + : SemaRef(S), Template(Template) { + // If the template is nested, then we need to use the original + // pattern to iterate over the constructors. + ClassTemplateDecl *Pattern = Template; + while (Pattern->getInstantiatedFromMemberTemplate()) { + if (Pattern->isMemberSpecialization()) + break; + Pattern = Pattern->getInstantiatedFromMemberTemplate(); + NestedPattern = Pattern; + } + + if (NestedPattern) + OuterInstantiationArgs = SemaRef.getTemplateInstantiationArgs(Template); + } + + Sema &SemaRef; + ClassTemplateDecl *Template; + ClassTemplateDecl *NestedPattern = nullptr; + + DeclContext *DC = Template->getDeclContext(); + CXXRecordDecl *Primary = Template->getTemplatedDecl(); + DeclarationName DeductionGuideName = + SemaRef.Context.DeclarationNames.getCXXDeductionGuideName(Template); + + QualType DeducedType = SemaRef.Context.getTypeDeclType(Primary); + + // Index adjustment to apply to convert depth-1 template parameters into + // depth-0 template parameters. + unsigned Depth1IndexAdjustment = Template->getTemplateParameters()->size(); + + // Instantiation arguments for the outermost depth-1 templates + // when the template is nested + MultiLevelTemplateArgumentList OuterInstantiationArgs; + + /// Transform a constructor declaration into a deduction guide. + NamedDecl *transformConstructor(FunctionTemplateDecl *FTD, + CXXConstructorDecl *CD) { + SmallVector SubstArgs; + + LocalInstantiationScope Scope(SemaRef); + + // C++ [over.match.class.deduct]p1: + // -- For each constructor of the class template designated by the + // template-name, a function template with the following properties: + + // -- The template parameters are the template parameters of the class + // template followed by the template parameters (including default + // template arguments) of the constructor, if any. + TemplateParameterList *TemplateParams = + SemaRef.GetTemplateParameterList(Template); + if (FTD) { + TemplateParameterList *InnerParams = FTD->getTemplateParameters(); + SmallVector AllParams; + SmallVector Depth1Args; + AllParams.reserve(TemplateParams->size() + InnerParams->size()); + AllParams.insert(AllParams.begin(), TemplateParams->begin(), + TemplateParams->end()); + SubstArgs.reserve(InnerParams->size()); + Depth1Args.reserve(InnerParams->size()); + + // Later template parameters could refer to earlier ones, so build up + // a list of substituted template arguments as we go. + for (NamedDecl *Param : *InnerParams) { + MultiLevelTemplateArgumentList Args; + Args.setKind(TemplateSubstitutionKind::Rewrite); + Args.addOuterTemplateArguments(Depth1Args); + Args.addOuterRetainedLevel(); + if (NestedPattern) + Args.addOuterRetainedLevels(NestedPattern->getTemplateDepth()); + NamedDecl *NewParam = transformTemplateParameter(Param, Args); + if (!NewParam) + return nullptr; + // Constraints require that we substitute depth-1 arguments + // to match depths when substituted for evaluation later + Depth1Args.push_back(SemaRef.Context.getCanonicalTemplateArgument( + SemaRef.Context.getInjectedTemplateArg(NewParam))); + + if (NestedPattern) { + TemplateDeclInstantiator Instantiator(SemaRef, DC, + OuterInstantiationArgs); + Instantiator.setEvaluateConstraints(false); + SemaRef.runWithSufficientStackSpace(NewParam->getLocation(), [&] { + NewParam = cast(Instantiator.Visit(NewParam)); + }); + } + + assert(NewParam->getTemplateDepth() == 0 && + "Unexpected template parameter depth"); + + AllParams.push_back(NewParam); + SubstArgs.push_back(SemaRef.Context.getCanonicalTemplateArgument( + SemaRef.Context.getInjectedTemplateArg(NewParam))); + } + + // Substitute new template parameters into requires-clause if present. + Expr *RequiresClause = nullptr; + if (Expr *InnerRC = InnerParams->getRequiresClause()) { + MultiLevelTemplateArgumentList Args; + Args.setKind(TemplateSubstitutionKind::Rewrite); + Args.addOuterTemplateArguments(Depth1Args); + Args.addOuterRetainedLevel(); + if (NestedPattern) + Args.addOuterRetainedLevels(NestedPattern->getTemplateDepth()); + ExprResult E = SemaRef.SubstExpr(InnerRC, Args); + if (E.isInvalid()) + return nullptr; + RequiresClause = E.getAs(); + } + + TemplateParams = TemplateParameterList::Create( + SemaRef.Context, InnerParams->getTemplateLoc(), + InnerParams->getLAngleLoc(), AllParams, InnerParams->getRAngleLoc(), + RequiresClause); + } + + // If we built a new template-parameter-list, track that we need to + // substitute references to the old parameters into references to the + // new ones. + MultiLevelTemplateArgumentList Args; + Args.setKind(TemplateSubstitutionKind::Rewrite); + if (FTD) { + Args.addOuterTemplateArguments(SubstArgs); + Args.addOuterRetainedLevel(); + } + + FunctionProtoTypeLoc FPTL = CD->getTypeSourceInfo() + ->getTypeLoc() + .getAsAdjusted(); + assert(FPTL && "no prototype for constructor declaration"); + + // Transform the type of the function, adjusting the return type and + // replacing references to the old parameters with references to the + // new ones. + TypeLocBuilder TLB; + SmallVector Params; + SmallVector MaterializedTypedefs; + QualType NewType = transformFunctionProtoType(TLB, FPTL, Params, Args, + MaterializedTypedefs); + if (NewType.isNull()) + return nullptr; + TypeSourceInfo *NewTInfo = TLB.getTypeSourceInfo(SemaRef.Context, NewType); + + return buildDeductionGuide( + SemaRef, Template, TemplateParams, CD, CD->getExplicitSpecifier(), + NewTInfo, CD->getBeginLoc(), CD->getLocation(), CD->getEndLoc(), + /*IsImplicit=*/true, MaterializedTypedefs); + } + + /// Build a deduction guide with the specified parameter types. + NamedDecl *buildSimpleDeductionGuide(MutableArrayRef ParamTypes) { + SourceLocation Loc = Template->getLocation(); + + // Build the requested type. + FunctionProtoType::ExtProtoInfo EPI; + EPI.HasTrailingReturn = true; + QualType Result = SemaRef.BuildFunctionType(DeducedType, ParamTypes, Loc, + DeductionGuideName, EPI); + TypeSourceInfo *TSI = SemaRef.Context.getTrivialTypeSourceInfo(Result, Loc); + if (NestedPattern) + TSI = SemaRef.SubstType(TSI, OuterInstantiationArgs, Loc, + DeductionGuideName); + + if (!TSI) + return nullptr; + + FunctionProtoTypeLoc FPTL = + TSI->getTypeLoc().castAs(); + + // Build the parameters, needed during deduction / substitution. + SmallVector Params; + for (auto T : ParamTypes) { + auto *TSI = SemaRef.Context.getTrivialTypeSourceInfo(T, Loc); + if (NestedPattern) + TSI = SemaRef.SubstType(TSI, OuterInstantiationArgs, Loc, + DeclarationName()); + if (!TSI) + return nullptr; + + ParmVarDecl *NewParam = + ParmVarDecl::Create(SemaRef.Context, DC, Loc, Loc, nullptr, + TSI->getType(), TSI, SC_None, nullptr); + NewParam->setScopeInfo(0, Params.size()); + FPTL.setParam(Params.size(), NewParam); + Params.push_back(NewParam); + } + + return buildDeductionGuide( + SemaRef, Template, SemaRef.GetTemplateParameterList(Template), nullptr, + ExplicitSpecifier(), TSI, Loc, Loc, Loc, /*IsImplicit=*/true); + } + +private: + /// Transform a constructor template parameter into a deduction guide template + /// parameter, rebuilding any internal references to earlier parameters and + /// renumbering as we go. + NamedDecl *transformTemplateParameter(NamedDecl *TemplateParam, + MultiLevelTemplateArgumentList &Args) { + if (auto *TTP = dyn_cast(TemplateParam)) + return transformTemplateTypeParam( + SemaRef, DC, TTP, Args, TTP->getDepth() - 1, + Depth1IndexAdjustment + TTP->getIndex()); + if (auto *TTP = dyn_cast(TemplateParam)) + return transformTemplateParam(SemaRef, DC, TTP, Args, + Depth1IndexAdjustment + TTP->getIndex(), + TTP->getDepth() - 1); + auto *NTTP = cast(TemplateParam); + return transformTemplateParam(SemaRef, DC, NTTP, Args, + Depth1IndexAdjustment + NTTP->getIndex(), + NTTP->getDepth() - 1); + } + + QualType transformFunctionProtoType( + TypeLocBuilder &TLB, FunctionProtoTypeLoc TL, + SmallVectorImpl &Params, + MultiLevelTemplateArgumentList &Args, + SmallVectorImpl &MaterializedTypedefs) { + SmallVector ParamTypes; + const FunctionProtoType *T = TL.getTypePtr(); + + // -- The types of the function parameters are those of the constructor. + for (auto *OldParam : TL.getParams()) { + ParmVarDecl *NewParam = OldParam; + // Given + // template struct C { + // template struct D { + // template D(U, V); + // }; + // }; + // First, transform all the references to template parameters that are + // defined outside of the surrounding class template. That is T in the + // above example. + if (NestedPattern) { + NewParam = transformFunctionTypeParam( + NewParam, OuterInstantiationArgs, MaterializedTypedefs, + /*TransformingOuterPatterns=*/true); + if (!NewParam) + return QualType(); + } + // Then, transform all the references to template parameters that are + // defined at the class template and the constructor. In this example, + // they're U and V, respectively. + NewParam = + transformFunctionTypeParam(NewParam, Args, MaterializedTypedefs, + /*TransformingOuterPatterns=*/false); + if (!NewParam) + return QualType(); + ParamTypes.push_back(NewParam->getType()); + Params.push_back(NewParam); + } + + // -- The return type is the class template specialization designated by + // the template-name and template arguments corresponding to the + // template parameters obtained from the class template. + // + // We use the injected-class-name type of the primary template instead. + // This has the convenient property that it is different from any type that + // the user can write in a deduction-guide (because they cannot enter the + // context of the template), so implicit deduction guides can never collide + // with explicit ones. + QualType ReturnType = DeducedType; + TLB.pushTypeSpec(ReturnType).setNameLoc(Primary->getLocation()); + + // Resolving a wording defect, we also inherit the variadicness of the + // constructor. + FunctionProtoType::ExtProtoInfo EPI; + EPI.Variadic = T->isVariadic(); + EPI.HasTrailingReturn = true; + + QualType Result = SemaRef.BuildFunctionType( + ReturnType, ParamTypes, TL.getBeginLoc(), DeductionGuideName, EPI); + if (Result.isNull()) + return QualType(); + + FunctionProtoTypeLoc NewTL = TLB.push(Result); + NewTL.setLocalRangeBegin(TL.getLocalRangeBegin()); + NewTL.setLParenLoc(TL.getLParenLoc()); + NewTL.setRParenLoc(TL.getRParenLoc()); + NewTL.setExceptionSpecRange(SourceRange()); + NewTL.setLocalRangeEnd(TL.getLocalRangeEnd()); + for (unsigned I = 0, E = NewTL.getNumParams(); I != E; ++I) + NewTL.setParam(I, Params[I]); + + return Result; + } + + ParmVarDecl *transformFunctionTypeParam( + ParmVarDecl *OldParam, MultiLevelTemplateArgumentList &Args, + llvm::SmallVectorImpl &MaterializedTypedefs, + bool TransformingOuterPatterns) { + TypeSourceInfo *OldDI = OldParam->getTypeSourceInfo(); + TypeSourceInfo *NewDI; + if (auto PackTL = OldDI->getTypeLoc().getAs()) { + // Expand out the one and only element in each inner pack. + Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(SemaRef, 0); + NewDI = + SemaRef.SubstType(PackTL.getPatternLoc(), Args, + OldParam->getLocation(), OldParam->getDeclName()); + if (!NewDI) + return nullptr; + NewDI = + SemaRef.CheckPackExpansion(NewDI, PackTL.getEllipsisLoc(), + PackTL.getTypePtr()->getNumExpansions()); + } else + NewDI = SemaRef.SubstType(OldDI, Args, OldParam->getLocation(), + OldParam->getDeclName()); + if (!NewDI) + return nullptr; + + // Extract the type. This (for instance) replaces references to typedef + // members of the current instantiations with the definitions of those + // typedefs, avoiding triggering instantiation of the deduced type during + // deduction. + NewDI = ExtractTypeForDeductionGuide( + SemaRef, MaterializedTypedefs, NestedPattern, + TransformingOuterPatterns ? &Args : nullptr) + .transform(NewDI); + + // Resolving a wording defect, we also inherit default arguments from the + // constructor. + ExprResult NewDefArg; + if (OldParam->hasDefaultArg()) { + // We don't care what the value is (we won't use it); just create a + // placeholder to indicate there is a default argument. + QualType ParamTy = NewDI->getType(); + NewDefArg = new (SemaRef.Context) + OpaqueValueExpr(OldParam->getDefaultArgRange().getBegin(), + ParamTy.getNonLValueExprType(SemaRef.Context), + ParamTy->isLValueReferenceType() ? VK_LValue + : ParamTy->isRValueReferenceType() ? VK_XValue + : VK_PRValue); + } + // Handle arrays and functions decay. + auto NewType = NewDI->getType(); + if (NewType->isArrayType() || NewType->isFunctionType()) + NewType = SemaRef.Context.getDecayedType(NewType); + + ParmVarDecl *NewParam = ParmVarDecl::Create( + SemaRef.Context, DC, OldParam->getInnerLocStart(), + OldParam->getLocation(), OldParam->getIdentifier(), NewType, NewDI, + OldParam->getStorageClass(), NewDefArg.get()); + NewParam->setScopeInfo(OldParam->getFunctionScopeDepth(), + OldParam->getFunctionScopeIndex()); + SemaRef.CurrentInstantiationScope->InstantiatedLocal(OldParam, NewParam); + return NewParam; + } +}; + +unsigned getTemplateParameterDepth(NamedDecl *TemplateParam) { + if (auto *TTP = dyn_cast(TemplateParam)) + return TTP->getDepth(); + if (auto *TTP = dyn_cast(TemplateParam)) + return TTP->getDepth(); + if (auto *NTTP = dyn_cast(TemplateParam)) + return NTTP->getDepth(); + llvm_unreachable("Unhandled template parameter types"); +} + +unsigned getTemplateParameterIndex(NamedDecl *TemplateParam) { + if (auto *TTP = dyn_cast(TemplateParam)) + return TTP->getIndex(); + if (auto *TTP = dyn_cast(TemplateParam)) + return TTP->getIndex(); + if (auto *NTTP = dyn_cast(TemplateParam)) + return NTTP->getIndex(); + llvm_unreachable("Unhandled template parameter types"); +} + +// Find all template parameters that appear in the given DeducedArgs. +// Return the indices of the template parameters in the TemplateParams. +SmallVector TemplateParamsReferencedInTemplateArgumentList( + const TemplateParameterList *TemplateParamsList, + ArrayRef DeducedArgs) { + struct TemplateParamsReferencedFinder + : public RecursiveASTVisitor { + const TemplateParameterList *TemplateParamList; + llvm::BitVector ReferencedTemplateParams; + + TemplateParamsReferencedFinder( + const TemplateParameterList *TemplateParamList) + : TemplateParamList(TemplateParamList), + ReferencedTemplateParams(TemplateParamList->size()) {} + + bool VisitTemplateTypeParmType(TemplateTypeParmType *TTP) { + // We use the index and depth to retrieve the corresponding template + // parameter from the parameter list, which is more robost. + Mark(TTP->getDepth(), TTP->getIndex()); + return true; + } + + bool VisitDeclRefExpr(DeclRefExpr *DRE) { + MarkAppeared(DRE->getFoundDecl()); + return true; + } + + bool TraverseTemplateName(TemplateName Template) { + if (auto *TD = Template.getAsTemplateDecl()) + MarkAppeared(TD); + return RecursiveASTVisitor::TraverseTemplateName(Template); + } + + void MarkAppeared(NamedDecl *ND) { + if (llvm::isa(ND)) + Mark(getTemplateParameterDepth(ND), getTemplateParameterIndex(ND)); + } + void Mark(unsigned Depth, unsigned Index) { + if (Index < TemplateParamList->size() && + TemplateParamList->getParam(Index)->getTemplateDepth() == Depth) + ReferencedTemplateParams.set(Index); + } + }; + TemplateParamsReferencedFinder Finder(TemplateParamsList); + Finder.TraverseTemplateArguments(DeducedArgs); + + SmallVector Results; + for (unsigned Index = 0; Index < TemplateParamsList->size(); ++Index) { + if (Finder.ReferencedTemplateParams[Index]) + Results.push_back(Index); + } + return Results; +} + +bool hasDeclaredDeductionGuides(DeclarationName Name, DeclContext *DC) { + // Check whether we've already declared deduction guides for this template. + // FIXME: Consider storing a flag on the template to indicate this. + assert(Name.getNameKind() == + DeclarationName::NameKind::CXXDeductionGuideName && + "name must be a deduction guide name"); + auto Existing = DC->lookup(Name); + for (auto *D : Existing) + if (D->isImplicit()) + return true; + return false; +} + +NamedDecl *transformTemplateParameter(Sema &SemaRef, DeclContext *DC, + NamedDecl *TemplateParam, + MultiLevelTemplateArgumentList &Args, + unsigned NewIndex, unsigned NewDepth) { + if (auto *TTP = dyn_cast(TemplateParam)) + return transformTemplateTypeParam(SemaRef, DC, TTP, Args, NewDepth, + NewIndex); + if (auto *TTP = dyn_cast(TemplateParam)) + return transformTemplateParam(SemaRef, DC, TTP, Args, NewIndex, NewDepth); + if (auto *NTTP = dyn_cast(TemplateParam)) + return transformTemplateParam(SemaRef, DC, NTTP, Args, NewIndex, NewDepth); + llvm_unreachable("Unhandled template parameter types"); +} + +// Build the associated constraints for the alias deduction guides. +// C++ [over.match.class.deduct]p3.3: +// The associated constraints ([temp.constr.decl]) are the conjunction of the +// associated constraints of g and a constraint that is satisfied if and only +// if the arguments of A are deducible (see below) from the return type. +// +// The return result is expected to be the require-clause for the synthesized +// alias deduction guide. +Expr * +buildAssociatedConstraints(Sema &SemaRef, FunctionTemplateDecl *F, + TypeAliasTemplateDecl *AliasTemplate, + ArrayRef DeduceResults, + unsigned FirstUndeducedParamIdx, Expr *IsDeducible) { + Expr *RC = F->getTemplateParameters()->getRequiresClause(); + if (!RC) + return IsDeducible; + + ASTContext &Context = SemaRef.Context; + LocalInstantiationScope Scope(SemaRef); + + // In the clang AST, constraint nodes are deliberately not instantiated unless + // they are actively being evaluated. Consequently, occurrences of template + // parameters in the require-clause expression have a subtle "depth" + // difference compared to normal occurrences in places, such as function + // parameters. When transforming the require-clause, we must take this + // distinction into account: + // + // 1) In the transformed require-clause, occurrences of template parameters + // must use the "uninstantiated" depth; + // 2) When substituting on the require-clause expr of the underlying + // deduction guide, we must use the entire set of template argument lists; + // + // It's important to note that we're performing this transformation on an + // *instantiated* AliasTemplate. + + // For 1), if the alias template is nested within a class template, we + // calcualte the 'uninstantiated' depth by adding the substitution level back. + unsigned AdjustDepth = 0; + if (auto *PrimaryTemplate = + AliasTemplate->getInstantiatedFromMemberTemplate()) + AdjustDepth = PrimaryTemplate->getTemplateDepth(); + + // We rebuild all template parameters with the uninstantiated depth, and + // build template arguments refer to them. + SmallVector AdjustedAliasTemplateArgs; + + for (auto *TP : *AliasTemplate->getTemplateParameters()) { + // Rebuild any internal references to earlier parameters and reindex + // as we go. + MultiLevelTemplateArgumentList Args; + Args.setKind(TemplateSubstitutionKind::Rewrite); + Args.addOuterTemplateArguments(AdjustedAliasTemplateArgs); + NamedDecl *NewParam = transformTemplateParameter( + SemaRef, AliasTemplate->getDeclContext(), TP, Args, + /*NewIndex=*/AdjustedAliasTemplateArgs.size(), + getTemplateParameterDepth(TP) + AdjustDepth); + + auto NewTemplateArgument = Context.getCanonicalTemplateArgument( + Context.getInjectedTemplateArg(NewParam)); + AdjustedAliasTemplateArgs.push_back(NewTemplateArgument); + } + // Template arguments used to transform the template arguments in + // DeducedResults. + SmallVector TemplateArgsForBuildingRC( + F->getTemplateParameters()->size()); + // Transform the transformed template args + MultiLevelTemplateArgumentList Args; + Args.setKind(TemplateSubstitutionKind::Rewrite); + Args.addOuterTemplateArguments(AdjustedAliasTemplateArgs); + + for (unsigned Index = 0; Index < DeduceResults.size(); ++Index) { + const auto &D = DeduceResults[Index]; + if (D.isNull()) { // non-deduced template parameters of f + NamedDecl *TP = F->getTemplateParameters()->getParam(Index); + MultiLevelTemplateArgumentList Args; + Args.setKind(TemplateSubstitutionKind::Rewrite); + Args.addOuterTemplateArguments(TemplateArgsForBuildingRC); + // Rebuild the template parameter with updated depth and index. + NamedDecl *NewParam = transformTemplateParameter( + SemaRef, F->getDeclContext(), TP, Args, + /*NewIndex=*/FirstUndeducedParamIdx, + getTemplateParameterDepth(TP) + AdjustDepth); + FirstUndeducedParamIdx += 1; + assert(TemplateArgsForBuildingRC[Index].isNull()); + TemplateArgsForBuildingRC[Index] = Context.getCanonicalTemplateArgument( + Context.getInjectedTemplateArg(NewParam)); + continue; + } + TemplateArgumentLoc Input = + SemaRef.getTrivialTemplateArgumentLoc(D, QualType(), SourceLocation{}); + TemplateArgumentLoc Output; + if (!SemaRef.SubstTemplateArgument(Input, Args, Output)) { + assert(TemplateArgsForBuildingRC[Index].isNull() && + "InstantiatedArgs must be null before setting"); + TemplateArgsForBuildingRC[Index] = Output.getArgument(); + } + } + + // A list of template arguments for transforming the require-clause of F. + // It must contain the entire set of template argument lists. + MultiLevelTemplateArgumentList ArgsForBuildingRC; + ArgsForBuildingRC.setKind(clang::TemplateSubstitutionKind::Rewrite); + ArgsForBuildingRC.addOuterTemplateArguments(TemplateArgsForBuildingRC); + // For 2), if the underlying deduction guide F is nested in a class template, + // we need the entire template argument list, as the constraint AST in the + // require-clause of F remains completely uninstantiated. + // + // For example: + // template // depth 0 + // struct Outer { + // template + // struct Foo { Foo(U); }; + // + // template // depth 1 + // requires C + // Foo(U) -> Foo; + // }; + // template + // using AFoo = Outer::Foo; + // + // In this scenario, the deduction guide for `Foo` inside `Outer`: + // - The occurrence of U in the require-expression is [depth:1, index:0] + // - The occurrence of U in the function parameter is [depth:0, index:0] + // - The template parameter of U is [depth:0, index:0] + // + // We add the outer template arguments which is [int] to the multi-level arg + // list to ensure that the occurrence U in `C` will be replaced with int + // during the substitution. + // + // NOTE: The underlying deduction guide F is instantiated -- either from an + // explicitly-written deduction guide member, or from a constructor. + // getInstantiatedFromMemberTemplate() can only handle the former case, so we + // check the DeclContext kind. + if (F->getLexicalDeclContext()->getDeclKind() == + clang::Decl::ClassTemplateSpecialization) { + auto OuterLevelArgs = SemaRef.getTemplateInstantiationArgs( + F, F->getLexicalDeclContext(), + /*Final=*/false, /*Innermost=*/std::nullopt, + /*RelativeToPrimary=*/true, + /*Pattern=*/nullptr, + /*ForConstraintInstantiation=*/true); + for (auto It : OuterLevelArgs) + ArgsForBuildingRC.addOuterTemplateArguments(It.Args); + } + + ExprResult E = SemaRef.SubstExpr(RC, ArgsForBuildingRC); + if (E.isInvalid()) + return nullptr; + + auto Conjunction = + SemaRef.BuildBinOp(SemaRef.getCurScope(), SourceLocation{}, + BinaryOperatorKind::BO_LAnd, E.get(), IsDeducible); + if (Conjunction.isInvalid()) + return nullptr; + return Conjunction.getAs(); +} +// Build the is_deducible constraint for the alias deduction guides. +// [over.match.class.deduct]p3.3: +// ... and a constraint that is satisfied if and only if the arguments +// of A are deducible (see below) from the return type. +Expr *buildIsDeducibleConstraint(Sema &SemaRef, + TypeAliasTemplateDecl *AliasTemplate, + QualType ReturnType, + SmallVector TemplateParams) { + ASTContext &Context = SemaRef.Context; + // Constraint AST nodes must use uninstantiated depth. + if (auto *PrimaryTemplate = + AliasTemplate->getInstantiatedFromMemberTemplate(); + PrimaryTemplate && TemplateParams.size() > 0) { + LocalInstantiationScope Scope(SemaRef); + + // Adjust the depth for TemplateParams. + unsigned AdjustDepth = PrimaryTemplate->getTemplateDepth(); + SmallVector TransformedTemplateArgs; + for (auto *TP : TemplateParams) { + // Rebuild any internal references to earlier parameters and reindex + // as we go. + MultiLevelTemplateArgumentList Args; + Args.setKind(TemplateSubstitutionKind::Rewrite); + Args.addOuterTemplateArguments(TransformedTemplateArgs); + NamedDecl *NewParam = transformTemplateParameter( + SemaRef, AliasTemplate->getDeclContext(), TP, Args, + /*NewIndex=*/TransformedTemplateArgs.size(), + getTemplateParameterDepth(TP) + AdjustDepth); + + auto NewTemplateArgument = Context.getCanonicalTemplateArgument( + Context.getInjectedTemplateArg(NewParam)); + TransformedTemplateArgs.push_back(NewTemplateArgument); + } + // Transformed the ReturnType to restore the uninstantiated depth. + MultiLevelTemplateArgumentList Args; + Args.setKind(TemplateSubstitutionKind::Rewrite); + Args.addOuterTemplateArguments(TransformedTemplateArgs); + ReturnType = SemaRef.SubstType( + ReturnType, Args, AliasTemplate->getLocation(), + Context.DeclarationNames.getCXXDeductionGuideName(AliasTemplate)); + }; + + SmallVector IsDeducibleTypeTraitArgs = { + Context.getTrivialTypeSourceInfo( + Context.getDeducedTemplateSpecializationType( + TemplateName(AliasTemplate), /*DeducedType=*/QualType(), + /*IsDependent=*/true)), // template specialization type whose + // arguments will be deduced. + Context.getTrivialTypeSourceInfo( + ReturnType), // type from which template arguments are deduced. + }; + return TypeTraitExpr::Create( + Context, Context.getLogicalOperationType(), AliasTemplate->getLocation(), + TypeTrait::BTT_IsDeducible, IsDeducibleTypeTraitArgs, + AliasTemplate->getLocation(), /*Value*/ false); +} + +std::pair> +getRHSTemplateDeclAndArgs(Sema &SemaRef, TypeAliasTemplateDecl *AliasTemplate) { + // Unwrap the sugared ElaboratedType. + auto RhsType = AliasTemplate->getTemplatedDecl() + ->getUnderlyingType() + .getSingleStepDesugaredType(SemaRef.Context); + TemplateDecl *Template = nullptr; + llvm::ArrayRef AliasRhsTemplateArgs; + if (const auto *TST = RhsType->getAs()) { + // Cases where the RHS of the alias is dependent. e.g. + // template + // using AliasFoo1 = Foo; // a class/type alias template specialization + Template = TST->getTemplateName().getAsTemplateDecl(); + AliasRhsTemplateArgs = TST->template_arguments(); + } else if (const auto *RT = RhsType->getAs()) { + // Cases where template arguments in the RHS of the alias are not + // dependent. e.g. + // using AliasFoo = Foo; + if (const auto *CTSD = llvm::dyn_cast( + RT->getAsCXXRecordDecl())) { + Template = CTSD->getSpecializedTemplate(); + AliasRhsTemplateArgs = CTSD->getTemplateArgs().asArray(); + } + } else { + assert(false && "unhandled RHS type of the alias"); + } + return {Template, AliasRhsTemplateArgs}; +} + +// Build deduction guides for a type alias template from the given underlying +// deduction guide F. +FunctionTemplateDecl * +BuildDeductionGuideForTypeAlias(Sema &SemaRef, + TypeAliasTemplateDecl *AliasTemplate, + FunctionTemplateDecl *F, SourceLocation Loc) { + LocalInstantiationScope Scope(SemaRef); + Sema::InstantiatingTemplate BuildingDeductionGuides( + SemaRef, AliasTemplate->getLocation(), F, + Sema::InstantiatingTemplate::BuildingDeductionGuidesTag{}); + if (BuildingDeductionGuides.isInvalid()) + return nullptr; + + auto &Context = SemaRef.Context; + auto [Template, AliasRhsTemplateArgs] = + getRHSTemplateDeclAndArgs(SemaRef, AliasTemplate); + + auto RType = F->getTemplatedDecl()->getReturnType(); + // The (trailing) return type of the deduction guide. + const TemplateSpecializationType *FReturnType = + RType->getAs(); + if (const auto *InjectedCNT = RType->getAs()) + // implicitly-generated deduction guide. + FReturnType = InjectedCNT->getInjectedTST(); + else if (const auto *ET = RType->getAs()) + // explicit deduction guide. + FReturnType = ET->getNamedType()->getAs(); + assert(FReturnType && "expected to see a return type"); + // Deduce template arguments of the deduction guide f from the RHS of + // the alias. + // + // C++ [over.match.class.deduct]p3: ...For each function or function + // template f in the guides of the template named by the + // simple-template-id of the defining-type-id, the template arguments + // of the return type of f are deduced from the defining-type-id of A + // according to the process in [temp.deduct.type] with the exception + // that deduction does not fail if not all template arguments are + // deduced. + // + // + // template + // f(X, Y) -> f; + // + // template + // using alias = f; + // + // The RHS of alias is f, we deduced the template arguments of + // the return type of the deduction guide from it: Y->int, X->U + sema::TemplateDeductionInfo TDeduceInfo(Loc); + // Must initialize n elements, this is required by DeduceTemplateArguments. + SmallVector DeduceResults( + F->getTemplateParameters()->size()); + + // FIXME: DeduceTemplateArguments stops immediately at the first + // non-deducible template argument. However, this doesn't seem to casue + // issues for practice cases, we probably need to extend it to continue + // performing deduction for rest of arguments to align with the C++ + // standard. + SemaRef.DeduceTemplateArguments( + F->getTemplateParameters(), FReturnType->template_arguments(), + AliasRhsTemplateArgs, TDeduceInfo, DeduceResults, + /*NumberOfArgumentsMustMatch=*/false); + + SmallVector DeducedArgs; + SmallVector NonDeducedTemplateParamsInFIndex; + // !!NOTE: DeduceResults respects the sequence of template parameters of + // the deduction guide f. + for (unsigned Index = 0; Index < DeduceResults.size(); ++Index) { + if (const auto &D = DeduceResults[Index]; !D.isNull()) // Deduced + DeducedArgs.push_back(D); + else + NonDeducedTemplateParamsInFIndex.push_back(Index); + } + auto DeducedAliasTemplateParams = + TemplateParamsReferencedInTemplateArgumentList( + AliasTemplate->getTemplateParameters(), DeducedArgs); + // All template arguments null by default. + SmallVector TemplateArgsForBuildingFPrime( + F->getTemplateParameters()->size()); + + // Create a template parameter list for the synthesized deduction guide f'. + // + // C++ [over.match.class.deduct]p3.2: + // If f is a function template, f' is a function template whose template + // parameter list consists of all the template parameters of A + // (including their default template arguments) that appear in the above + // deductions or (recursively) in their default template arguments + SmallVector FPrimeTemplateParams; + // Store template arguments that refer to the newly-created template + // parameters, used for building `TemplateArgsForBuildingFPrime`. + SmallVector TransformedDeducedAliasArgs( + AliasTemplate->getTemplateParameters()->size()); + + for (unsigned AliasTemplateParamIdx : DeducedAliasTemplateParams) { + auto *TP = + AliasTemplate->getTemplateParameters()->getParam(AliasTemplateParamIdx); + // Rebuild any internal references to earlier parameters and reindex as + // we go. + MultiLevelTemplateArgumentList Args; + Args.setKind(TemplateSubstitutionKind::Rewrite); + Args.addOuterTemplateArguments(TransformedDeducedAliasArgs); + NamedDecl *NewParam = transformTemplateParameter( + SemaRef, AliasTemplate->getDeclContext(), TP, Args, + /*NewIndex=*/FPrimeTemplateParams.size(), + getTemplateParameterDepth(TP)); + FPrimeTemplateParams.push_back(NewParam); + + auto NewTemplateArgument = Context.getCanonicalTemplateArgument( + Context.getInjectedTemplateArg(NewParam)); + TransformedDeducedAliasArgs[AliasTemplateParamIdx] = NewTemplateArgument; + } + unsigned FirstUndeducedParamIdx = FPrimeTemplateParams.size(); + // ...followed by the template parameters of f that were not deduced + // (including their default template arguments) + for (unsigned FTemplateParamIdx : NonDeducedTemplateParamsInFIndex) { + auto *TP = F->getTemplateParameters()->getParam(FTemplateParamIdx); + MultiLevelTemplateArgumentList Args; + Args.setKind(TemplateSubstitutionKind::Rewrite); + // We take a shortcut here, it is ok to reuse the + // TemplateArgsForBuildingFPrime. + Args.addOuterTemplateArguments(TemplateArgsForBuildingFPrime); + NamedDecl *NewParam = transformTemplateParameter( + SemaRef, F->getDeclContext(), TP, Args, FPrimeTemplateParams.size(), + getTemplateParameterDepth(TP)); + FPrimeTemplateParams.push_back(NewParam); + + assert(TemplateArgsForBuildingFPrime[FTemplateParamIdx].isNull() && + "The argument must be null before setting"); + TemplateArgsForBuildingFPrime[FTemplateParamIdx] = + Context.getCanonicalTemplateArgument( + Context.getInjectedTemplateArg(NewParam)); + } + + // To form a deduction guide f' from f, we leverage clang's instantiation + // mechanism, we construct a template argument list where the template + // arguments refer to the newly-created template parameters of f', and + // then apply instantiation on this template argument list to instantiate + // f, this ensures all template parameter occurrences are updated + // correctly. + // + // The template argument list is formed from the `DeducedArgs`, two parts: + // 1) appeared template parameters of alias: transfrom the deduced + // template argument; + // 2) non-deduced template parameters of f: rebuild a + // template argument; + // + // 2) has been built already (when rebuilding the new template + // parameters), we now perform 1). + MultiLevelTemplateArgumentList Args; + Args.setKind(TemplateSubstitutionKind::Rewrite); + Args.addOuterTemplateArguments(TransformedDeducedAliasArgs); + for (unsigned Index = 0; Index < DeduceResults.size(); ++Index) { + const auto &D = DeduceResults[Index]; + if (D.isNull()) { + // 2): Non-deduced template parameter has been built already. + assert(!TemplateArgsForBuildingFPrime[Index].isNull() && + "template arguments for non-deduced template parameters should " + "be been set!"); + continue; + } + TemplateArgumentLoc Input = + SemaRef.getTrivialTemplateArgumentLoc(D, QualType(), SourceLocation{}); + TemplateArgumentLoc Output; + if (!SemaRef.SubstTemplateArgument(Input, Args, Output)) { + assert(TemplateArgsForBuildingFPrime[Index].isNull() && + "InstantiatedArgs must be null before setting"); + TemplateArgsForBuildingFPrime[Index] = Output.getArgument(); + } + } + + auto *TemplateArgListForBuildingFPrime = + TemplateArgumentList::CreateCopy(Context, TemplateArgsForBuildingFPrime); + // Form the f' by substituting the template arguments into f. + if (auto *FPrime = SemaRef.InstantiateFunctionDeclaration( + F, TemplateArgListForBuildingFPrime, AliasTemplate->getLocation(), + Sema::CodeSynthesisContext::BuildingDeductionGuides)) { + auto *GG = cast(FPrime); + + Expr *IsDeducible = buildIsDeducibleConstraint( + SemaRef, AliasTemplate, FPrime->getReturnType(), FPrimeTemplateParams); + Expr *RequiresClause = + buildAssociatedConstraints(SemaRef, F, AliasTemplate, DeduceResults, + FirstUndeducedParamIdx, IsDeducible); + + auto *FPrimeTemplateParamList = TemplateParameterList::Create( + Context, AliasTemplate->getTemplateParameters()->getTemplateLoc(), + AliasTemplate->getTemplateParameters()->getLAngleLoc(), + FPrimeTemplateParams, + AliasTemplate->getTemplateParameters()->getRAngleLoc(), + /*RequiresClause=*/RequiresClause); + auto *Result = cast(buildDeductionGuide( + SemaRef, AliasTemplate, FPrimeTemplateParamList, + GG->getCorrespondingConstructor(), GG->getExplicitSpecifier(), + GG->getTypeSourceInfo(), AliasTemplate->getBeginLoc(), + AliasTemplate->getLocation(), AliasTemplate->getEndLoc(), + F->isImplicit())); + cast(Result->getTemplatedDecl()) + ->setDeductionCandidateKind(GG->getDeductionCandidateKind()); + return Result; + } + return nullptr; +} + +void DeclareImplicitDeductionGuidesForTypeAlias( + Sema &SemaRef, TypeAliasTemplateDecl *AliasTemplate, SourceLocation Loc) { + if (AliasTemplate->isInvalidDecl()) + return; + auto &Context = SemaRef.Context; + // FIXME: if there is an explicit deduction guide after the first use of the + // type alias usage, we will not cover this explicit deduction guide. fix this + // case. + if (hasDeclaredDeductionGuides( + Context.DeclarationNames.getCXXDeductionGuideName(AliasTemplate), + AliasTemplate->getDeclContext())) + return; + auto [Template, AliasRhsTemplateArgs] = + getRHSTemplateDeclAndArgs(SemaRef, AliasTemplate); + if (!Template) + return; + DeclarationNameInfo NameInfo( + Context.DeclarationNames.getCXXDeductionGuideName(Template), Loc); + LookupResult Guides(SemaRef, NameInfo, clang::Sema::LookupOrdinaryName); + SemaRef.LookupQualifiedName(Guides, Template->getDeclContext()); + Guides.suppressDiagnostics(); + + for (auto *G : Guides) { + if (auto *DG = dyn_cast(G)) { + // The deduction guide is a non-template function decl, we just clone it. + auto *FunctionType = + SemaRef.Context.getTrivialTypeSourceInfo(DG->getType()); + FunctionProtoTypeLoc FPTL = + FunctionType->getTypeLoc().castAs(); + + // Clone the parameters. + for (unsigned I = 0, N = DG->getNumParams(); I != N; ++I) { + const auto *P = DG->getParamDecl(I); + auto *TSI = SemaRef.Context.getTrivialTypeSourceInfo(P->getType()); + ParmVarDecl *NewParam = ParmVarDecl::Create( + SemaRef.Context, G->getDeclContext(), + DG->getParamDecl(I)->getBeginLoc(), P->getLocation(), nullptr, + TSI->getType(), TSI, SC_None, nullptr); + NewParam->setScopeInfo(0, I); + FPTL.setParam(I, NewParam); + } + auto *Transformed = cast(buildDeductionGuide( + SemaRef, AliasTemplate, /*TemplateParams=*/nullptr, + /*Constructor=*/nullptr, DG->getExplicitSpecifier(), FunctionType, + AliasTemplate->getBeginLoc(), AliasTemplate->getLocation(), + AliasTemplate->getEndLoc(), DG->isImplicit())); + + // FIXME: Here the synthesized deduction guide is not a templated + // function. Per [dcl.decl]p4, the requires-clause shall be present only + // if the declarator declares a templated function, a bug in standard? + auto *Constraint = buildIsDeducibleConstraint( + SemaRef, AliasTemplate, Transformed->getReturnType(), {}); + if (auto *RC = DG->getTrailingRequiresClause()) { + auto Conjunction = + SemaRef.BuildBinOp(SemaRef.getCurScope(), SourceLocation{}, + BinaryOperatorKind::BO_LAnd, RC, Constraint); + if (!Conjunction.isInvalid()) + Constraint = Conjunction.getAs(); + } + Transformed->setTrailingRequiresClause(Constraint); + } + FunctionTemplateDecl *F = dyn_cast(G); + if (!F) + continue; + // The **aggregate** deduction guides are handled in a different code path + // (DeclareAggregateDeductionGuideFromInitList), which involves the tricky + // cache. + if (cast(F->getTemplatedDecl()) + ->getDeductionCandidateKind() == DeductionCandidate::Aggregate) + continue; + + BuildDeductionGuideForTypeAlias(SemaRef, AliasTemplate, F, Loc); + } +} + +// Build an aggregate deduction guide for a type alias template. +FunctionTemplateDecl *DeclareAggregateDeductionGuideForTypeAlias( + Sema &SemaRef, TypeAliasTemplateDecl *AliasTemplate, + MutableArrayRef ParamTypes, SourceLocation Loc) { + TemplateDecl *RHSTemplate = + getRHSTemplateDeclAndArgs(SemaRef, AliasTemplate).first; + if (!RHSTemplate) + return nullptr; + auto *RHSDeductionGuide = SemaRef.DeclareAggregateDeductionGuideFromInitList( + RHSTemplate, ParamTypes, Loc); + if (!RHSDeductionGuide) + return nullptr; + return BuildDeductionGuideForTypeAlias(SemaRef, AliasTemplate, + RHSDeductionGuide, Loc); +} + +} // namespace + +FunctionTemplateDecl *Sema::DeclareAggregateDeductionGuideFromInitList( + TemplateDecl *Template, MutableArrayRef ParamTypes, + SourceLocation Loc) { + llvm::FoldingSetNodeID ID; + ID.AddPointer(Template); + for (auto &T : ParamTypes) + T.getCanonicalType().Profile(ID); + unsigned Hash = ID.ComputeHash(); + + auto Found = AggregateDeductionCandidates.find(Hash); + if (Found != AggregateDeductionCandidates.end()) { + CXXDeductionGuideDecl *GD = Found->getSecond(); + return GD->getDescribedFunctionTemplate(); + } + + if (auto *AliasTemplate = llvm::dyn_cast(Template)) { + if (auto *FTD = DeclareAggregateDeductionGuideForTypeAlias( + *this, AliasTemplate, ParamTypes, Loc)) { + auto *GD = cast(FTD->getTemplatedDecl()); + GD->setDeductionCandidateKind(DeductionCandidate::Aggregate); + AggregateDeductionCandidates[Hash] = GD; + return FTD; + } + } + + if (CXXRecordDecl *DefRecord = + cast(Template->getTemplatedDecl())->getDefinition()) { + if (TemplateDecl *DescribedTemplate = + DefRecord->getDescribedClassTemplate()) + Template = DescribedTemplate; + } + + DeclContext *DC = Template->getDeclContext(); + if (DC->isDependentContext()) + return nullptr; + + ConvertConstructorToDeductionGuideTransform Transform( + *this, cast(Template)); + if (!isCompleteType(Loc, Transform.DeducedType)) + return nullptr; + + // In case we were expanding a pack when we attempted to declare deduction + // guides, turn off pack expansion for everything we're about to do. + ArgumentPackSubstitutionIndexRAII SubstIndex(*this, + /*NewSubstitutionIndex=*/-1); + // Create a template instantiation record to track the "instantiation" of + // constructors into deduction guides. + InstantiatingTemplate BuildingDeductionGuides( + *this, Loc, Template, + Sema::InstantiatingTemplate::BuildingDeductionGuidesTag{}); + if (BuildingDeductionGuides.isInvalid()) + return nullptr; + + ClassTemplateDecl *Pattern = + Transform.NestedPattern ? Transform.NestedPattern : Transform.Template; + ContextRAII SavedContext(*this, Pattern->getTemplatedDecl()); + + auto *FTD = cast( + Transform.buildSimpleDeductionGuide(ParamTypes)); + SavedContext.pop(); + auto *GD = cast(FTD->getTemplatedDecl()); + GD->setDeductionCandidateKind(DeductionCandidate::Aggregate); + AggregateDeductionCandidates[Hash] = GD; + return FTD; +} + +void Sema::DeclareImplicitDeductionGuides(TemplateDecl *Template, + SourceLocation Loc) { + if (auto *AliasTemplate = llvm::dyn_cast(Template)) { + DeclareImplicitDeductionGuidesForTypeAlias(*this, AliasTemplate, Loc); + return; + } + if (CXXRecordDecl *DefRecord = + cast(Template->getTemplatedDecl())->getDefinition()) { + if (TemplateDecl *DescribedTemplate = + DefRecord->getDescribedClassTemplate()) + Template = DescribedTemplate; + } + + DeclContext *DC = Template->getDeclContext(); + if (DC->isDependentContext()) + return; + + ConvertConstructorToDeductionGuideTransform Transform( + *this, cast(Template)); + if (!isCompleteType(Loc, Transform.DeducedType)) + return; + + if (hasDeclaredDeductionGuides(Transform.DeductionGuideName, DC)) + return; + + // In case we were expanding a pack when we attempted to declare deduction + // guides, turn off pack expansion for everything we're about to do. + ArgumentPackSubstitutionIndexRAII SubstIndex(*this, -1); + // Create a template instantiation record to track the "instantiation" of + // constructors into deduction guides. + InstantiatingTemplate BuildingDeductionGuides( + *this, Loc, Template, + Sema::InstantiatingTemplate::BuildingDeductionGuidesTag{}); + if (BuildingDeductionGuides.isInvalid()) + return; + + // Convert declared constructors into deduction guide templates. + // FIXME: Skip constructors for which deduction must necessarily fail (those + // for which some class template parameter without a default argument never + // appears in a deduced context). + ClassTemplateDecl *Pattern = + Transform.NestedPattern ? Transform.NestedPattern : Transform.Template; + ContextRAII SavedContext(*this, Pattern->getTemplatedDecl()); + llvm::SmallPtrSet ProcessedCtors; + bool AddedAny = false; + for (NamedDecl *D : LookupConstructors(Pattern->getTemplatedDecl())) { + D = D->getUnderlyingDecl(); + if (D->isInvalidDecl() || D->isImplicit()) + continue; + + D = cast(D->getCanonicalDecl()); + + // Within C++20 modules, we may have multiple same constructors in + // multiple same RecordDecls. And it doesn't make sense to create + // duplicated deduction guides for the duplicated constructors. + if (ProcessedCtors.count(D)) + continue; + + auto *FTD = dyn_cast(D); + auto *CD = + dyn_cast_or_null(FTD ? FTD->getTemplatedDecl() : D); + // Class-scope explicit specializations (MS extension) do not result in + // deduction guides. + if (!CD || (!FTD && CD->isFunctionTemplateSpecialization())) + continue; + + // Cannot make a deduction guide when unparsed arguments are present. + if (llvm::any_of(CD->parameters(), [](ParmVarDecl *P) { + return !P || P->hasUnparsedDefaultArg(); + })) + continue; + + ProcessedCtors.insert(D); + Transform.transformConstructor(FTD, CD); + AddedAny = true; + } + + // C++17 [over.match.class.deduct] + // -- If C is not defined or does not declare any constructors, an + // additional function template derived as above from a hypothetical + // constructor C(). + if (!AddedAny) + Transform.buildSimpleDeductionGuide(std::nullopt); + + // -- An additional function template derived as above from a hypothetical + // constructor C(C), called the copy deduction candidate. + cast( + cast( + Transform.buildSimpleDeductionGuide(Transform.DeducedType)) + ->getTemplatedDecl()) + ->setDeductionCandidateKind(DeductionCandidate::Copy); + + SavedContext.pop(); +} diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 64c5a16b2e4c38..a7bc6749c58520 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1515,11 +1515,12 @@ namespace { NestedNameSpecifierLoc QualifierLoc, QualType T); - TemplateName TransformTemplateName(CXXScopeSpec &SS, TemplateName Name, - SourceLocation NameLoc, - QualType ObjectType = QualType(), - bool AllowInjectedClassName = false, - bool MayBeNNS = false); + TemplateName + TransformTemplateName(CXXScopeSpec &SS, TemplateName Name, + SourceLocation NameLoc, + QualType ObjectType = QualType(), + NamedDecl *FirstQualifierInScope = nullptr, + bool AllowInjectedClassName = false); const CXXAssumeAttr *TransformCXXAssumeAttr(const CXXAssumeAttr *AA); const LoopHintAttr *TransformLoopHintAttr(const LoopHintAttr *LH); @@ -1951,7 +1952,8 @@ TemplateInstantiator::RebuildElaboratedType(SourceLocation KeywordLoc, TemplateName TemplateInstantiator::TransformTemplateName( CXXScopeSpec &SS, TemplateName Name, SourceLocation NameLoc, - QualType ObjectType, bool AllowInjectedClassName, bool MayBeNNS) { + QualType ObjectType, NamedDecl *FirstQualifierInScope, + bool AllowInjectedClassName) { if (TemplateTemplateParmDecl *TTP = dyn_cast_or_null(Name.getAsTemplateDecl())) { if (TTP->getDepth() < TemplateArgs.getNumLevels()) { @@ -2023,7 +2025,8 @@ TemplateName TemplateInstantiator::TransformTemplateName( } return inherited::TransformTemplateName(SS, Name, NameLoc, ObjectType, - AllowInjectedClassName, MayBeNNS); + FirstQualifierInScope, + AllowInjectedClassName); } ExprResult diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 714409f927a8d5..baac1fe4f24079 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -2373,7 +2373,7 @@ QualType Sema::BuildExtVectorType(QualType T, Expr *ArraySize, // on bitvectors, and we have no well-defined ABI for bitvectors, so vectors // of bool aren't allowed. // - // We explictly allow bool elements in ext_vector_type for C/C++. + // We explicitly allow bool elements in ext_vector_type for C/C++. bool IsNoBoolVecLang = getLangOpts().OpenCL || getLangOpts().OpenCLCPlusPlus; if ((!T->isDependentType() && !T->isIntegerType() && !T->isRealFloatingType()) || @@ -4762,6 +4762,61 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state, // Check for auto functions and trailing return type and adjust the // return type accordingly. if (!D.isInvalidType()) { + auto IsClassType = [&](CXXScopeSpec &SS) { + // If there already was an problem with the scope, don’t issue another + // error about the explicit object parameter. + return SS.isInvalid() || + isa_and_present(S.computeDeclContext(SS)); + }; + + // C++23 [dcl.fct]p6: + // + // An explicit-object-parameter-declaration is a parameter-declaration + // with a this specifier. An explicit-object-parameter-declaration shall + // appear only as the first parameter-declaration of a + // parameter-declaration-list of one of: + // + // - a declaration of a member function or member function template + // ([class.mem]), or + // + // - an explicit instantiation ([temp.explicit]) or explicit + // specialization ([temp.expl.spec]) of a templated member function, + // or + // + // - a lambda-declarator [expr.prim.lambda]. + DeclaratorContext C = D.getContext(); + ParmVarDecl *First = + FTI.NumParams + ? dyn_cast_if_present(FTI.Params[0].Param) + : nullptr; + + bool IsFunctionDecl = D.getInnermostNonParenChunk() == &DeclType; + if (First && First->isExplicitObjectParameter() && + C != DeclaratorContext::LambdaExpr && + + // Either not a member or nested declarator in a member. + // + // Note that e.g. 'static' or 'friend' declarations are accepted + // here; we diagnose them later when we build the member function + // because it's easier that way. + (C != DeclaratorContext::Member || !IsFunctionDecl) && + + // Allow out-of-line definitions of member functions. + !IsClassType(D.getCXXScopeSpec())) { + if (IsFunctionDecl) + S.Diag(First->getBeginLoc(), + diag::err_explicit_object_parameter_nonmember) + << /*non-member*/ 2 << /*function*/ 0 + << First->getSourceRange(); + else + S.Diag(First->getBeginLoc(), + diag::err_explicit_object_parameter_invalid) + << First->getSourceRange(); + + D.setInvalidType(); + AreDeclaratorChunksValid = false; + } + // trailing-return-type is only required if we're declaring a function, // and not, for instance, a pointer to a function. if (D.getDeclSpec().hasAutoTypeSpec() && diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index f6021440cda69f..79bc5e5c55c87e 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -541,9 +541,10 @@ class TreeTransform { /// By default, transforms all of the types and declarations within the /// nested-name-specifier. Subclasses may override this function to provide /// alternate behavior. - NestedNameSpecifierLoc TransformNestedNameSpecifierLoc( - NestedNameSpecifierLoc NNS, QualType ObjectType = QualType(), - ArrayRef UnqualifiedLookups = std::nullopt); + NestedNameSpecifierLoc + TransformNestedNameSpecifierLoc(NestedNameSpecifierLoc NNS, + QualType ObjectType = QualType(), + NamedDecl *FirstQualifierInScope = nullptr); /// Transform the given declaration name. /// @@ -584,11 +585,12 @@ class TreeTransform { /// By default, transforms the template name by transforming the declarations /// and nested-name-specifiers that occur within the template name. /// Subclasses may override this function to provide alternate behavior. - TemplateName TransformTemplateName(CXXScopeSpec &SS, TemplateName Name, - SourceLocation NameLoc, - QualType ObjectType = QualType(), - bool AllowInjectedClassName = false, - bool MayBeNNS = false); + TemplateName + TransformTemplateName(CXXScopeSpec &SS, TemplateName Name, + SourceLocation NameLoc, + QualType ObjectType = QualType(), + NamedDecl *FirstQualifierInScope = nullptr, + bool AllowInjectedClassName = false); /// Transform the given template argument. /// @@ -1138,8 +1140,8 @@ class TreeTransform { CXXScopeSpec SS; SS.Adopt(QualifierLoc); TemplateName InstName = getDerived().RebuildTemplateName( - SS, TemplateKWLoc, *Name, NameLoc, QualType(), AllowInjectedClassName, - /*MayBeNNS=*/false); + SS, TemplateKWLoc, *Name, NameLoc, QualType(), nullptr, + AllowInjectedClassName); if (InstName.isNull()) return QualType(); @@ -1310,7 +1312,8 @@ class TreeTransform { SourceLocation TemplateKWLoc, const IdentifierInfo &Name, SourceLocation NameLoc, QualType ObjectType, - bool AllowInjectedClassName, bool MayBeNNS); + NamedDecl *FirstQualifierInScope, + bool AllowInjectedClassName); /// Build a new template name given a nested name specifier and the /// overloaded operator name that is referred to as a template. @@ -2846,14 +2849,15 @@ class TreeTransform { /// /// By default, performs semantic analysis to build the new expression. /// Subclasses may override this routine to provide different behavior. - ExprResult - RebuildMemberExpr(Expr *Base, SourceLocation OpLoc, bool isArrow, - NestedNameSpecifierLoc QualifierLoc, - SourceLocation TemplateKWLoc, - const DeclarationNameInfo &MemberNameInfo, - ValueDecl *Member, NamedDecl *FoundDecl, - const TemplateArgumentListInfo *ExplicitTemplateArgs, - ArrayRef UnqualifiedLookups) { + ExprResult RebuildMemberExpr(Expr *Base, SourceLocation OpLoc, + bool isArrow, + NestedNameSpecifierLoc QualifierLoc, + SourceLocation TemplateKWLoc, + const DeclarationNameInfo &MemberNameInfo, + ValueDecl *Member, + NamedDecl *FoundDecl, + const TemplateArgumentListInfo *ExplicitTemplateArgs, + NamedDecl *FirstQualifierInScope) { ExprResult BaseResult = getSema().PerformMemberExprBaseConversion(Base, isArrow); if (!Member->getDeclName()) { @@ -2890,7 +2894,6 @@ class TreeTransform { CXXScopeSpec SS; SS.Adopt(QualifierLoc); - SS.setUnqualifiedLookups(UnqualifiedLookups); Base = BaseResult.get(); if (Base->containsErrors()) @@ -2923,9 +2926,10 @@ class TreeTransform { } return getSema().BuildMemberReferenceExpr(Base, BaseType, OpLoc, isArrow, - SS, TemplateKWLoc, R, - ExplicitTemplateArgs, - /*S=*/nullptr); + SS, TemplateKWLoc, + FirstQualifierInScope, + R, ExplicitTemplateArgs, + /*S*/nullptr); } /// Build a new binary operator expression. @@ -2998,9 +3002,10 @@ class TreeTransform { CXXScopeSpec SS; DeclarationNameInfo NameInfo(&Accessor, AccessorLoc); return getSema().BuildMemberReferenceExpr( - Base, Base->getType(), OpLoc, IsArrow, SS, - /*TemplateKWLoc=*/SourceLocation(), NameInfo, - /*TemplateArgs=*/nullptr, /*S=*/nullptr); + Base, Base->getType(), OpLoc, IsArrow, SS, SourceLocation(), + /*FirstQualifierInScope*/ nullptr, NameInfo, + /* TemplateArgs */ nullptr, + /*S*/ nullptr); } /// Build a new initializer list expression. @@ -3568,37 +3573,46 @@ class TreeTransform { /// /// By default, performs semantic analysis to build the new expression. /// Subclasses may override this routine to provide different behavior. - ExprResult RebuildCXXDependentScopeMemberExpr( - Expr *BaseE, QualType BaseType, bool IsArrow, SourceLocation OperatorLoc, - NestedNameSpecifierLoc QualifierLoc, SourceLocation TemplateKWLoc, - ArrayRef UnqualifiedLookups, - const DeclarationNameInfo &MemberNameInfo, - const TemplateArgumentListInfo *TemplateArgs) { + ExprResult RebuildCXXDependentScopeMemberExpr(Expr *BaseE, + QualType BaseType, + bool IsArrow, + SourceLocation OperatorLoc, + NestedNameSpecifierLoc QualifierLoc, + SourceLocation TemplateKWLoc, + NamedDecl *FirstQualifierInScope, + const DeclarationNameInfo &MemberNameInfo, + const TemplateArgumentListInfo *TemplateArgs) { CXXScopeSpec SS; SS.Adopt(QualifierLoc); - SS.setUnqualifiedLookups(UnqualifiedLookups); - return SemaRef.BuildMemberReferenceExpr( - BaseE, BaseType, OperatorLoc, IsArrow, SS, TemplateKWLoc, - MemberNameInfo, TemplateArgs, /*S=*/nullptr); + return SemaRef.BuildMemberReferenceExpr(BaseE, BaseType, + OperatorLoc, IsArrow, + SS, TemplateKWLoc, + FirstQualifierInScope, + MemberNameInfo, + TemplateArgs, /*S*/nullptr); } /// Build a new member reference expression. /// /// By default, performs semantic analysis to build the new expression. /// Subclasses may override this routine to provide different behavior. - ExprResult RebuildUnresolvedMemberExpr( - Expr *BaseE, QualType BaseType, SourceLocation OperatorLoc, bool IsArrow, - NestedNameSpecifierLoc QualifierLoc, SourceLocation TemplateKWLoc, - ArrayRef UnqualifiedLookups, LookupResult &R, - const TemplateArgumentListInfo *TemplateArgs) { + ExprResult RebuildUnresolvedMemberExpr(Expr *BaseE, QualType BaseType, + SourceLocation OperatorLoc, + bool IsArrow, + NestedNameSpecifierLoc QualifierLoc, + SourceLocation TemplateKWLoc, + NamedDecl *FirstQualifierInScope, + LookupResult &R, + const TemplateArgumentListInfo *TemplateArgs) { CXXScopeSpec SS; SS.Adopt(QualifierLoc); - SS.setUnqualifiedLookups(UnqualifiedLookups); - return SemaRef.BuildMemberReferenceExpr(BaseE, BaseType, OperatorLoc, - IsArrow, SS, TemplateKWLoc, R, - TemplateArgs, /*S=*/nullptr); + return SemaRef.BuildMemberReferenceExpr(BaseE, BaseType, + OperatorLoc, IsArrow, + SS, TemplateKWLoc, + FirstQualifierInScope, + R, TemplateArgs, /*S*/nullptr); } /// Build a new noexcept expression. @@ -3817,8 +3831,10 @@ class TreeTransform { DeclarationNameInfo NameInfo(Ivar->getDeclName(), IvarLoc); ExprResult Result = getSema().BuildMemberReferenceExpr( BaseArg, BaseArg->getType(), - /*FIXME:*/ IvarLoc, IsArrow, SS, /*TemplateKWLoc=*/SourceLocation(), - NameInfo, /*TemplateArgs=*/nullptr, /*S=*/nullptr); + /*FIXME:*/ IvarLoc, IsArrow, SS, SourceLocation(), + /*FirstQualifierInScope=*/nullptr, NameInfo, + /*TemplateArgs=*/nullptr, + /*S=*/nullptr); if (IsFreeIvar && Result.isUsable()) cast(Result.get())->setIsFreeIvar(IsFreeIvar); return Result; @@ -3833,12 +3849,14 @@ class TreeTransform { SourceLocation PropertyLoc) { CXXScopeSpec SS; DeclarationNameInfo NameInfo(Property->getDeclName(), PropertyLoc); - return getSema().BuildMemberReferenceExpr( - BaseArg, BaseArg->getType(), - /*FIXME:*/ PropertyLoc, - /*IsArrow=*/false, SS, /*TemplateKWLoc=*/SourceLocation(), NameInfo, - /*TemplateArgs=*/nullptr, - /*S=*/nullptr); + return getSema().BuildMemberReferenceExpr(BaseArg, BaseArg->getType(), + /*FIXME:*/PropertyLoc, + /*IsArrow=*/false, + SS, SourceLocation(), + /*FirstQualifierInScope=*/nullptr, + NameInfo, + /*TemplateArgs=*/nullptr, + /*S=*/nullptr); } /// Build a new Objective-C property reference expression. @@ -3865,11 +3883,13 @@ class TreeTransform { SourceLocation OpLoc, bool IsArrow) { CXXScopeSpec SS; DeclarationNameInfo NameInfo(&getSema().Context.Idents.get("isa"), IsaLoc); - return getSema().BuildMemberReferenceExpr( - BaseArg, BaseArg->getType(), OpLoc, IsArrow, SS, - /*TemplateKWLoc=*/SourceLocation(), NameInfo, - /*TemplateArgs=*/nullptr, - /*S=*/nullptr); + return getSema().BuildMemberReferenceExpr(BaseArg, BaseArg->getType(), + OpLoc, IsArrow, + SS, SourceLocation(), + /*FirstQualifierInScope=*/nullptr, + NameInfo, + /*TemplateArgs=*/nullptr, + /*S=*/nullptr); } /// Build a new shuffle vector expression. @@ -4034,14 +4054,18 @@ class TreeTransform { } private: - TypeLoc TransformTypeInObjectScope(TypeLoc TL, QualType ObjectType, + TypeLoc TransformTypeInObjectScope(TypeLoc TL, + QualType ObjectType, + NamedDecl *FirstQualifierInScope, CXXScopeSpec &SS); TypeSourceInfo *TransformTypeInObjectScope(TypeSourceInfo *TSInfo, QualType ObjectType, + NamedDecl *FirstQualifierInScope, CXXScopeSpec &SS); TypeSourceInfo *TransformTSIInObjectScope(TypeLoc TL, QualType ObjectType, + NamedDecl *FirstQualifierInScope, CXXScopeSpec &SS); QualType TransformDependentNameType(TypeLocBuilder &TLB, @@ -4360,7 +4384,7 @@ Sema::ConditionResult TreeTransform::TransformCondition( template NestedNameSpecifierLoc TreeTransform::TransformNestedNameSpecifierLoc( NestedNameSpecifierLoc NNS, QualType ObjectType, - ArrayRef UnqualifiedLookups) { + NamedDecl *FirstQualifierInScope) { SmallVector Qualifiers; auto insertNNS = [&Qualifiers](NestedNameSpecifierLoc NNS) { @@ -4371,8 +4395,6 @@ NestedNameSpecifierLoc TreeTransform::TransformNestedNameSpecifierLoc( insertNNS(NNS); CXXScopeSpec SS; - SS.setUnqualifiedLookups(UnqualifiedLookups); - while (!Qualifiers.empty()) { NestedNameSpecifierLoc Q = Qualifiers.pop_back_val(); NestedNameSpecifier *QNNS = Q.getNestedNameSpecifier(); @@ -4382,9 +4404,8 @@ NestedNameSpecifierLoc TreeTransform::TransformNestedNameSpecifierLoc( Sema::NestedNameSpecInfo IdInfo(QNNS->getAsIdentifier(), Q.getLocalBeginLoc(), Q.getLocalEndLoc(), ObjectType); - if (SemaRef.BuildCXXNestedNameSpecifier(/*Scope=*/nullptr, IdInfo, - /*EnteringContext=*/false, SS, - /*ErrorRecoveryLookup=*/false)) + if (SemaRef.BuildCXXNestedNameSpecifier(/*Scope=*/nullptr, IdInfo, false, + SS, FirstQualifierInScope, false)) return NestedNameSpecifierLoc(); break; } @@ -4422,7 +4443,8 @@ NestedNameSpecifierLoc TreeTransform::TransformNestedNameSpecifierLoc( case NestedNameSpecifier::TypeSpecWithTemplate: case NestedNameSpecifier::TypeSpec: { - TypeLoc TL = TransformTypeInObjectScope(Q.getTypeLoc(), ObjectType, SS); + TypeLoc TL = TransformTypeInObjectScope(Q.getTypeLoc(), ObjectType, + FirstQualifierInScope, SS); if (!TL) return NestedNameSpecifierLoc(); @@ -4455,7 +4477,7 @@ NestedNameSpecifierLoc TreeTransform::TransformNestedNameSpecifierLoc( } // The qualifier-in-scope and object type only apply to the leftmost entity. - SS.setUnqualifiedLookups(std::nullopt); + FirstQualifierInScope = nullptr; ObjectType = QualType(); } @@ -4538,10 +4560,14 @@ ::TransformDeclarationNameInfo(const DeclarationNameInfo &NameInfo) { llvm_unreachable("Unknown name kind."); } -template -TemplateName TreeTransform::TransformTemplateName( - CXXScopeSpec &SS, TemplateName Name, SourceLocation NameLoc, - QualType ObjectType, bool AllowInjectedClassName, bool MayBeNNS) { +template +TemplateName +TreeTransform::TransformTemplateName(CXXScopeSpec &SS, + TemplateName Name, + SourceLocation NameLoc, + QualType ObjectType, + NamedDecl *FirstQualifierInScope, + bool AllowInjectedClassName) { if (QualifiedTemplateName *QTN = Name.getAsQualifiedTemplateName()) { TemplateDecl *Template = QTN->getUnderlyingTemplate().getAsTemplateDecl(); assert(Template && "qualified template name must refer to a template"); @@ -4565,7 +4591,7 @@ TemplateName TreeTransform::TransformTemplateName( if (SS.getScopeRep()) { // These apply to the scope specifier, not the template. ObjectType = QualType(); - SS.setUnqualifiedLookups(std::nullopt); + FirstQualifierInScope = nullptr; } if (!getDerived().AlwaysRebuild() && @@ -4577,9 +4603,13 @@ TemplateName TreeTransform::TransformTemplateName( SourceLocation TemplateKWLoc = NameLoc; if (DTN->isIdentifier()) { - return getDerived().RebuildTemplateName( - SS, TemplateKWLoc, *DTN->getIdentifier(), NameLoc, ObjectType, - AllowInjectedClassName, MayBeNNS); + return getDerived().RebuildTemplateName(SS, + TemplateKWLoc, + *DTN->getIdentifier(), + NameLoc, + ObjectType, + FirstQualifierInScope, + AllowInjectedClassName); } return getDerived().RebuildTemplateName(SS, TemplateKWLoc, @@ -5123,31 +5153,39 @@ QualType TreeTransform::RebuildQualifiedType(QualType T, return SemaRef.BuildQualifiedType(T, Loc, Quals); } -template -TypeLoc TreeTransform::TransformTypeInObjectScope(TypeLoc TL, - QualType ObjectType, - CXXScopeSpec &SS) { +template +TypeLoc +TreeTransform::TransformTypeInObjectScope(TypeLoc TL, + QualType ObjectType, + NamedDecl *UnqualLookup, + CXXScopeSpec &SS) { if (getDerived().AlreadyTransformed(TL.getType())) return TL; - TypeSourceInfo *TSI = TransformTSIInObjectScope(TL, ObjectType, SS); + TypeSourceInfo *TSI = + TransformTSIInObjectScope(TL, ObjectType, UnqualLookup, SS); if (TSI) return TSI->getTypeLoc(); return TypeLoc(); } -template -TypeSourceInfo *TreeTransform::TransformTypeInObjectScope( - TypeSourceInfo *TSInfo, QualType ObjectType, CXXScopeSpec &SS) { +template +TypeSourceInfo * +TreeTransform::TransformTypeInObjectScope(TypeSourceInfo *TSInfo, + QualType ObjectType, + NamedDecl *UnqualLookup, + CXXScopeSpec &SS) { if (getDerived().AlreadyTransformed(TSInfo->getType())) return TSInfo; - return TransformTSIInObjectScope(TSInfo->getTypeLoc(), ObjectType, SS); + return TransformTSIInObjectScope(TSInfo->getTypeLoc(), ObjectType, + UnqualLookup, SS); } template TypeSourceInfo *TreeTransform::TransformTSIInObjectScope( - TypeLoc TL, QualType ObjectType, CXXScopeSpec &SS) { + TypeLoc TL, QualType ObjectType, NamedDecl *UnqualLookup, + CXXScopeSpec &SS) { QualType T = TL.getType(); assert(!getDerived().AlreadyTransformed(T)); @@ -5160,7 +5198,7 @@ TypeSourceInfo *TreeTransform::TransformTSIInObjectScope( TemplateName Template = getDerived().TransformTemplateName( SS, SpecTL.getTypePtr()->getTemplateName(), SpecTL.getTemplateNameLoc(), - ObjectType, /*AllowInjectedClassName=*/true, /*MayBeNNS=*/true); + ObjectType, UnqualLookup, /*AllowInjectedClassName*/true); if (Template.isNull()) return nullptr; @@ -5170,11 +5208,13 @@ TypeSourceInfo *TreeTransform::TransformTSIInObjectScope( DependentTemplateSpecializationTypeLoc SpecTL = TL.castAs(); - TemplateName Template = getDerived().RebuildTemplateName( - SS, SpecTL.getTemplateKeywordLoc(), - *SpecTL.getTypePtr()->getIdentifier(), SpecTL.getTemplateNameLoc(), - ObjectType, - /*AllowInjectedClassName=*/true, /*MayBeNNS=*/true); + TemplateName Template + = getDerived().RebuildTemplateName(SS, + SpecTL.getTemplateKeywordLoc(), + *SpecTL.getTypePtr()->getIdentifier(), + SpecTL.getTemplateNameLoc(), + ObjectType, UnqualLookup, + /*AllowInjectedClassName*/true); if (Template.isNull()) return nullptr; @@ -6697,8 +6737,13 @@ QualType TreeTransform::TransformUnaryTransformType( QualType Result = TL.getType(); if (Result->isDependentType()) { const UnaryTransformType *T = TL.getTypePtr(); - QualType NewBase = - getDerived().TransformType(TL.getUnderlyingTInfo())->getType(); + + TypeSourceInfo *NewBaseTSI = + getDerived().TransformType(TL.getUnderlyingTInfo()); + if (!NewBaseTSI) + return QualType(); + QualType NewBase = NewBaseTSI->getType(); + Result = getDerived().RebuildUnaryTransformType(NewBase, T->getUTTKind(), TL.getKWLoc()); @@ -12318,8 +12363,7 @@ TreeTransform::TransformMemberExpr(MemberExpr *E) { // first-qualifier-in-scope here, just in case we had a dependent // base (and therefore couldn't do the check) and a // nested-name-qualifier (and therefore could do the lookup). - ArrayRef UnqualifiedLookups; - + NamedDecl *FirstQualifierInScope = nullptr; DeclarationNameInfo MemberNameInfo = E->getMemberNameInfo(); if (MemberNameInfo.getName()) { MemberNameInfo = getDerived().TransformDeclarationNameInfo(MemberNameInfo); @@ -12327,11 +12371,16 @@ TreeTransform::TransformMemberExpr(MemberExpr *E) { return ExprError(); } - return getDerived().RebuildMemberExpr( - Base.get(), FakeOperatorLoc, E->isArrow(), QualifierLoc, TemplateKWLoc, - MemberNameInfo, Member, FoundDecl, - (E->hasExplicitTemplateArgs() ? &TransArgs : nullptr), - UnqualifiedLookups); + return getDerived().RebuildMemberExpr(Base.get(), FakeOperatorLoc, + E->isArrow(), + QualifierLoc, + TemplateKWLoc, + MemberNameInfo, + Member, + FoundDecl, + (E->hasExplicitTemplateArgs() + ? &TransArgs : nullptr), + FirstQualifierInScope); } template @@ -13458,8 +13507,9 @@ TreeTransform::TransformCXXPseudoDestructorExpr( PseudoDestructorTypeStorage Destroyed; if (E->getDestroyedTypeInfo()) { - TypeSourceInfo *DestroyedTypeInfo = getDerived().TransformTypeInObjectScope( - E->getDestroyedTypeInfo(), ObjectType, SS); + TypeSourceInfo *DestroyedTypeInfo + = getDerived().TransformTypeInObjectScope(E->getDestroyedTypeInfo(), + ObjectType, nullptr, SS); if (!DestroyedTypeInfo) return ExprError(); Destroyed = DestroyedTypeInfo; @@ -13485,7 +13535,7 @@ TreeTransform::TransformCXXPseudoDestructorExpr( if (E->getScopeTypeInfo()) { CXXScopeSpec EmptySS; ScopeTypeInfo = getDerived().TransformTypeInObjectScope( - E->getScopeTypeInfo(), ObjectType, EmptySS); + E->getScopeTypeInfo(), ObjectType, nullptr, EmptySS); if (!ScopeTypeInfo) return ExprError(); } @@ -14746,17 +14796,19 @@ TreeTransform::TransformCXXDependentScopeMemberExpr( ObjectType = BaseType->castAs()->getPointeeType(); } - UnresolvedSet<4> UnqualifiedLookups; - for (auto D : E->unqualified_lookups()) { - if (NamedDecl *InstD = getDerived().TransformFirstQualifierInScope( - D.getDecl(), E->getQualifierLoc().getBeginLoc())) - UnqualifiedLookups.addDecl(InstD); - } + // Transform the first part of the nested-name-specifier that qualifies + // the member name. + NamedDecl *FirstQualifierInScope + = getDerived().TransformFirstQualifierInScope( + E->getFirstQualifierFoundInScope(), + E->getQualifierLoc().getBeginLoc()); NestedNameSpecifierLoc QualifierLoc; if (E->getQualifier()) { - QualifierLoc = getDerived().TransformNestedNameSpecifierLoc( - E->getQualifierLoc(), ObjectType, UnqualifiedLookups.pairs()); + QualifierLoc + = getDerived().TransformNestedNameSpecifierLoc(E->getQualifierLoc(), + ObjectType, + FirstQualifierInScope); if (!QualifierLoc) return ExprError(); } @@ -14775,16 +14827,23 @@ TreeTransform::TransformCXXDependentScopeMemberExpr( if (!E->hasExplicitTemplateArgs()) { // This is a reference to a member without an explicitly-specified // template argument list. Optimize for this common case. - if (!getDerived().AlwaysRebuild() && Base.get() == OldBase && - BaseType == E->getBaseType() && QualifierLoc == E->getQualifierLoc() && + if (!getDerived().AlwaysRebuild() && + Base.get() == OldBase && + BaseType == E->getBaseType() && + QualifierLoc == E->getQualifierLoc() && NameInfo.getName() == E->getMember() && - UnqualifiedLookups.pairs() == E->unqualified_lookups()) + FirstQualifierInScope == E->getFirstQualifierFoundInScope()) return E; - return getDerived().RebuildCXXDependentScopeMemberExpr( - Base.get(), BaseType, E->isArrow(), E->getOperatorLoc(), QualifierLoc, - TemplateKWLoc, UnqualifiedLookups.pairs(), NameInfo, - /*TemplateArgs*/ nullptr); + return getDerived().RebuildCXXDependentScopeMemberExpr(Base.get(), + BaseType, + E->isArrow(), + E->getOperatorLoc(), + QualifierLoc, + TemplateKWLoc, + FirstQualifierInScope, + NameInfo, + /*TemplateArgs*/nullptr); } TemplateArgumentListInfo TransArgs(E->getLAngleLoc(), E->getRAngleLoc()); @@ -14793,9 +14852,15 @@ TreeTransform::TransformCXXDependentScopeMemberExpr( TransArgs)) return ExprError(); - return getDerived().RebuildCXXDependentScopeMemberExpr( - Base.get(), BaseType, E->isArrow(), E->getOperatorLoc(), QualifierLoc, - TemplateKWLoc, UnqualifiedLookups.pairs(), NameInfo, &TransArgs); + return getDerived().RebuildCXXDependentScopeMemberExpr(Base.get(), + BaseType, + E->isArrow(), + E->getOperatorLoc(), + QualifierLoc, + TemplateKWLoc, + FirstQualifierInScope, + NameInfo, + &TransArgs); } template @@ -14856,11 +14921,11 @@ ExprResult TreeTransform::TransformUnresolvedMemberExpr( // first-qualifier-in-scope here, just in case we had a dependent // base (and therefore couldn't do the check) and a // nested-name-qualifier (and therefore could do the lookup). - ArrayRef UnqualifiedLookups; + NamedDecl *FirstQualifierInScope = nullptr; return getDerived().RebuildUnresolvedMemberExpr( Base.get(), BaseType, Old->getOperatorLoc(), Old->isArrow(), QualifierLoc, - TemplateKWLoc, UnqualifiedLookups, R, + TemplateKWLoc, FirstQualifierInScope, R, (Old->hasExplicitTemplateArgs() ? &TransArgs : nullptr)); } @@ -16217,18 +16282,22 @@ TreeTransform::RebuildTemplateName(CXXScopeSpec &SS, TemplateName(Template)); } -template -TemplateName TreeTransform::RebuildTemplateName( - CXXScopeSpec &SS, SourceLocation TemplateKWLoc, const IdentifierInfo &Name, - SourceLocation NameLoc, QualType ObjectType, bool AllowInjectedClassName, - bool MayBeNNS) { +template +TemplateName +TreeTransform::RebuildTemplateName(CXXScopeSpec &SS, + SourceLocation TemplateKWLoc, + const IdentifierInfo &Name, + SourceLocation NameLoc, + QualType ObjectType, + NamedDecl *FirstQualifierInScope, + bool AllowInjectedClassName) { UnqualifiedId TemplateName; TemplateName.setIdentifier(&Name, NameLoc); Sema::TemplateTy Template; getSema().ActOnTemplateName(/*Scope=*/nullptr, SS, TemplateKWLoc, TemplateName, ParsedType::make(ObjectType), /*EnteringContext=*/false, Template, - AllowInjectedClassName, MayBeNNS); + AllowInjectedClassName); return Template.get(); } @@ -16376,10 +16445,13 @@ TreeTransform::RebuildCXXPseudoDestructorExpr(Expr *Base, } SourceLocation TemplateKWLoc; // FIXME: retrieve it from caller. - return getSema().BuildMemberReferenceExpr( - Base, BaseType, OperatorLoc, isArrow, SS, TemplateKWLoc, NameInfo, - /*TemplateArgs=*/nullptr, - /*S=*/nullptr); + return getSema().BuildMemberReferenceExpr(Base, BaseType, + OperatorLoc, isArrow, + SS, TemplateKWLoc, + /*FIXME: FirstQualifier*/ nullptr, + NameInfo, + /*TemplateArgs*/ nullptr, + /*S*/nullptr); } template diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index cbaf1b0a98c614..76032aa836b507 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -1847,13 +1847,8 @@ void ASTDeclReader::VisitNamespaceDecl(NamespaceDecl *D) { // same namespace, and we have an invariant that older declarations // get merged before newer ones try to merge. GlobalDeclID AnonNamespace; - if (Redecl.getFirstID() == ThisDeclID) { + if (Redecl.getFirstID() == ThisDeclID) AnonNamespace = readDeclID(); - } else { - // Link this namespace back to the first declaration, which has already - // been deserialized. - D->AnonOrFirstNamespaceAndFlags.setPointer(D->getFirstDecl()); - } mergeRedeclarable(D, Redecl); @@ -2974,13 +2969,6 @@ void ASTDeclReader::mergeRedeclarable(Redeclarable *DBase, T *Existing, ExistingCanon->Used |= D->Used; D->Used = false; - // When we merge a namespace, update its pointer to the first namespace. - // We cannot have loaded any redeclarations of this declaration yet, so - // there's nothing else that needs to be updated. - if (auto *Namespace = dyn_cast(D)) - Namespace->AnonOrFirstNamespaceAndFlags.setPointer( - assert_cast(ExistingCanon)); - // When we merge a template, merge its pattern. if (auto *DTemplate = dyn_cast(D)) mergeTemplatePattern( @@ -3293,7 +3281,7 @@ ASTDeclReader::getOrFakePrimaryClassDefinition(ASTReader &Reader, DeclContext *ASTDeclReader::getPrimaryContextForMerging(ASTReader &Reader, DeclContext *DC) { if (auto *ND = dyn_cast(DC)) - return ND->getOriginalNamespace(); + return ND->getFirstDecl(); if (auto *RD = dyn_cast(DC)) return getOrFakePrimaryClassDefinition(Reader, RD); diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index 6ccb4b01a036ac..6955b42f14e06e 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -785,6 +785,12 @@ void ASTStmtReader::VisitUnaryExprOrTypeTraitExpr(UnaryExprOrTypeTraitExpr *E) { E->setRParenLoc(readSourceLocation()); } +static StringRef saveStrToCtx(const std::string &S, ASTContext &Ctx) { + char *Buf = new (Ctx) char[S.size()]; + std::copy(S.begin(), S.end(), Buf); + return StringRef(Buf, S.size()); +} + static ConstraintSatisfaction readConstraintSatisfaction(ASTRecordReader &Record) { ConstraintSatisfaction Satisfaction; @@ -795,7 +801,9 @@ readConstraintSatisfaction(ASTRecordReader &Record) { for (unsigned i = 0; i != NumDetailRecords; ++i) { if (/* IsDiagnostic */Record.readInt()) { SourceLocation DiagLocation = Record.readSourceLocation(); - std::string DiagMessage = Record.readString(); + StringRef DiagMessage = + saveStrToCtx(Record.readString(), Record.getContext()); + Satisfaction.Details.emplace_back( new (Record.getContext()) ConstraintSatisfaction::SubstitutionDiagnostic(DiagLocation, @@ -820,9 +828,13 @@ void ASTStmtReader::VisitConceptSpecializationExpr( static concepts::Requirement::SubstitutionDiagnostic * readSubstitutionDiagnostic(ASTRecordReader &Record) { - std::string SubstitutedEntity = Record.readString(); + StringRef SubstitutedEntity = + saveStrToCtx(Record.readString(), Record.getContext()); + SourceLocation DiagLoc = Record.readSourceLocation(); - std::string DiagMessage = Record.readString(); + StringRef DiagMessage = + saveStrToCtx(Record.readString(), Record.getContext()); + return new (Record.getContext()) concepts::Requirement::SubstitutionDiagnostic{SubstitutedEntity, DiagLoc, DiagMessage}; @@ -909,14 +921,10 @@ void ASTStmtReader::VisitRequiresExpr(RequiresExpr *E) { case concepts::Requirement::RK_Nested: { bool HasInvalidConstraint = Record.readInt(); if (HasInvalidConstraint) { - std::string InvalidConstraint = Record.readString(); - char *InvalidConstraintBuf = - new (Record.getContext()) char[InvalidConstraint.size()]; - std::copy(InvalidConstraint.begin(), InvalidConstraint.end(), - InvalidConstraintBuf); + StringRef InvalidConstraint = + saveStrToCtx(Record.readString(), Record.getContext()); R = new (Record.getContext()) concepts::NestedRequirement( - Record.getContext(), - StringRef(InvalidConstraintBuf, InvalidConstraint.size()), + Record.getContext(), InvalidConstraint, readConstraintSatisfaction(Record)); break; } @@ -1992,43 +2000,42 @@ void ASTStmtReader::VisitCXXDependentScopeMemberExpr( CXXDependentScopeMemberExpr *E) { VisitExpr(E); - CurrentUnpackingBits.emplace(Record.readInt()); - bool HasQualifier = CurrentUnpackingBits->getNextBit(); - bool HasTemplateInfo = CurrentUnpackingBits->getNextBit(); - unsigned NumUnqualifiedLookups = Record.readInt(); unsigned NumTemplateArgs = Record.readInt(); - E->CXXDependentScopeMemberExprBits.HasQualifier = HasQualifier; - E->CXXDependentScopeMemberExprBits.NumUnqualifiedLookups = - NumUnqualifiedLookups; - E->CXXDependentScopeMemberExprBits.HasTemplateKWAndArgsInfo = HasTemplateInfo; + CurrentUnpackingBits.emplace(Record.readInt()); + bool HasTemplateKWAndArgsInfo = CurrentUnpackingBits->getNextBit(); + bool HasFirstQualifierFoundInScope = CurrentUnpackingBits->getNextBit(); + + assert((HasTemplateKWAndArgsInfo == E->hasTemplateKWAndArgsInfo()) && + "Wrong HasTemplateKWAndArgsInfo!"); + assert( + (HasFirstQualifierFoundInScope == E->hasFirstQualifierFoundInScope()) && + "Wrong HasFirstQualifierFoundInScope!"); + + if (HasTemplateKWAndArgsInfo) + ReadTemplateKWAndArgsInfo( + *E->getTrailingObjects(), + E->getTrailingObjects(), NumTemplateArgs); + + assert((NumTemplateArgs == E->getNumTemplateArgs()) && + "Wrong NumTemplateArgs!"); - E->BaseType = Record.readType(); E->CXXDependentScopeMemberExprBits.IsArrow = CurrentUnpackingBits->getNextBit(); + E->BaseType = Record.readType(); + E->QualifierLoc = Record.readNestedNameSpecifierLoc(); + // not ImplicitAccess if (CurrentUnpackingBits->getNextBit()) E->Base = Record.readSubExpr(); else E->Base = nullptr; - E->OperatorLoc = Record.readSourceLocation(); - E->MemberNameInfo = Record.readDeclarationNameInfo(); + E->CXXDependentScopeMemberExprBits.OperatorLoc = readSourceLocation(); - if (HasQualifier) - new (E->getTrailingObjects()) - NestedNameSpecifierLoc(Record.readNestedNameSpecifierLoc()); - - for (unsigned I = 0; I != NumUnqualifiedLookups; ++I) { - auto *FoundD = Record.readDeclAs(); - auto AS = (AccessSpecifier)Record.readInt(); - E->getTrailingObjects()[I] = - DeclAccessPair::make(FoundD, AS); - } + if (HasFirstQualifierFoundInScope) + *E->getTrailingObjects() = readDeclAs(); - if (HasTemplateInfo) - ReadTemplateKWAndArgsInfo( - *E->getTrailingObjects(), - E->getTrailingObjects(), NumTemplateArgs); + E->MemberNameInfo = Record.readDeclarationNameInfo(); } void @@ -4075,16 +4082,16 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { break; case EXPR_CXX_DEPENDENT_SCOPE_MEMBER: { + unsigned NumTemplateArgs = Record[ASTStmtReader::NumExprFields]; BitsUnpacker DependentScopeMemberBits( - Record[ASTStmtReader::NumExprFields]); - bool HasQualifier = DependentScopeMemberBits.getNextBit(); - bool HasTemplateInfo = DependentScopeMemberBits.getNextBit(); - unsigned NumUnqualifiedLookups = Record[ASTStmtReader::NumExprFields + 1]; - unsigned NumTemplateArgs = Record[ASTStmtReader::NumExprFields + 2]; + Record[ASTStmtReader::NumExprFields + 1]); + bool HasTemplateKWAndArgsInfo = DependentScopeMemberBits.getNextBit(); + bool HasFirstQualifierFoundInScope = + DependentScopeMemberBits.getNextBit(); S = CXXDependentScopeMemberExpr::CreateEmpty( - Context, HasQualifier, NumUnqualifiedLookups, HasTemplateInfo, - NumTemplateArgs); + Context, HasTemplateKWAndArgsInfo, NumTemplateArgs, + HasFirstQualifierFoundInScope); break; } diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index b6583c54c9ba1f..5dff0cec5c0ea0 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -1383,7 +1383,7 @@ void ASTDeclWriter::VisitNamespaceDecl(NamespaceDecl *D) { Record.AddSourceLocation(D->getBeginLoc()); Record.AddSourceLocation(D->getRBraceLoc()); - if (D->isOriginalNamespace()) + if (D->isFirstDecl()) Record.AddDeclRef(D->getAnonymousNamespace()); Code = serialization::DECL_NAMESPACE; diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 98606bbf8c4a0f..d36f43fdaf2621 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -1986,41 +1986,34 @@ void ASTStmtWriter::VisitCXXDependentScopeMemberExpr( CXXDependentScopeMemberExpr *E) { VisitExpr(E); - bool HasQualifier = E->hasQualifier(); - unsigned NumUnqualifiedLookups = E->getNumUnqualifiedLookups(); - bool HasTemplateInfo = E->hasTemplateKWAndArgsInfo(); - unsigned NumTemplateArgs = E->getNumTemplateArgs(); - - // Write these first for easy access when deserializing, as they affect the - // size of the CXXDependentScopeMemberExpr. + // Don't emit anything here (or if you do you will have to update + // the corresponding deserialization function). + Record.push_back(E->getNumTemplateArgs()); CurrentPackingBits.updateBits(); - CurrentPackingBits.addBit(HasQualifier); - CurrentPackingBits.addBit(HasTemplateInfo); - Record.push_back(NumUnqualifiedLookups); - Record.push_back(NumTemplateArgs); + CurrentPackingBits.addBit(E->hasTemplateKWAndArgsInfo()); + CurrentPackingBits.addBit(E->hasFirstQualifierFoundInScope()); + + if (E->hasTemplateKWAndArgsInfo()) { + const ASTTemplateKWAndArgsInfo &ArgInfo = + *E->getTrailingObjects(); + AddTemplateKWAndArgsInfo(ArgInfo, + E->getTrailingObjects()); + } - Record.AddTypeRef(E->getBaseType()); CurrentPackingBits.addBit(E->isArrow()); + + Record.AddTypeRef(E->getBaseType()); + Record.AddNestedNameSpecifierLoc(E->getQualifierLoc()); CurrentPackingBits.addBit(!E->isImplicitAccess()); if (!E->isImplicitAccess()) Record.AddStmt(E->getBase()); Record.AddSourceLocation(E->getOperatorLoc()); - Record.AddDeclarationNameInfo(E->MemberNameInfo); - - if (HasQualifier) - Record.AddNestedNameSpecifierLoc(E->getQualifierLoc()); - - for (DeclAccessPair D : E->unqualified_lookups()) { - Record.AddDeclRef(D.getDecl()); - Record.push_back(D.getAccess()); - } - - if (HasTemplateInfo) - AddTemplateKWAndArgsInfo(*E->getTrailingObjects(), - E->getTrailingObjects()); + if (E->hasFirstQualifierFoundInScope()) + Record.AddDeclRef(E->getFirstQualifierFoundInScope()); + Record.AddDeclarationNameInfo(E->MemberNameInfo); Code = serialization::EXPR_CXX_DEPENDENT_SCOPE_MEMBER; } diff --git a/clang/test/AST/HLSL/RWBuffer-AST.hlsl b/clang/test/AST/HLSL/RWBuffer-AST.hlsl index cb66a703a4ec98..e95acb8896ba4b 100644 --- a/clang/test/AST/HLSL/RWBuffer-AST.hlsl +++ b/clang/test/AST/HLSL/RWBuffer-AST.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY %s | FileCheck -check-prefix=EMPTY %s -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY %s | FileCheck -check-prefix=EMPTY %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump %s | FileCheck %s // This test tests two different AST generations. The "EMPTY" test mode verifies @@ -25,10 +25,6 @@ RWBuffer Buffer; #endif -// CHECK: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class Resource definition -// CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h 'void *' - // CHECK: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <> implicit RWBuffer // CHECK-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <> class depth 0 index 0 element_type // CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class RWBuffer definition diff --git a/clang/test/AST/HLSL/ResourceStruct.hlsl b/clang/test/AST/HLSL/ResourceStruct.hlsl deleted file mode 100644 index 04b3b93119903e..00000000000000 --- a/clang/test/AST/HLSL/ResourceStruct.hlsl +++ /dev/null @@ -1,14 +0,0 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -ast-dump %s | FileCheck %s - -// CHECK: NamespaceDecl {{.*}} implicit hlsl -// CHECK: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class Resource definition -// CHECK-NEXT: DefinitionData -// CHECK-NEXT: DefaultConstructor exists trivial needs_implicit -// CHECK-NEXT: CopyConstructor simple trivial has_const_param needs_implicit implicit_has_const_param -// CHECK-NEXT: MoveConstructor exists simple trivial needs_implicit -// CHECK-NEXT: CopyAssignment simple trivial has_const_param needs_implicit implicit_has_const_param -// CHECK-NEXT: MoveAssignment exists simple trivial needs_implicit -// CHECK-NEXT: Destructor simple irrelevant trivial needs_implicit -// CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> -// implicit h 'void *' diff --git a/clang/test/AST/Interp/c23.c b/clang/test/AST/Interp/c23.c index cf1bf4d4e7d905..e839fc716f5b52 100644 --- a/clang/test/AST/Interp/c23.c +++ b/clang/test/AST/Interp/c23.c @@ -8,4 +8,17 @@ constexpr _Bool inf2 = (1.0/0.0 == __builtin_inf()); // both-error {{must be ini // both-note {{division by zero}} constexpr _Bool inf3 = __builtin_inf() == __builtin_inf(); +/// Used to crash. +struct S { + int x; + char c; + float f; +}; +#define DECL_BUFFER(Ty, Name) alignas(Ty) unsigned char Name[sizeof(Ty)] + +char bar() { + DECL_BUFFER(struct S, buffer); + ((struct S *)buffer)->c = 'a'; + return ((struct S *)buffer)->c; +} diff --git a/clang/test/AST/Interp/lambda.cpp b/clang/test/AST/Interp/lambda.cpp index 0eb12643b1b7f4..d68fe995e8fa1c 100644 --- a/clang/test/AST/Interp/lambda.cpp +++ b/clang/test/AST/Interp/lambda.cpp @@ -280,3 +280,9 @@ namespace InvalidCapture { } (); } } + +constexpr int fn() { + int Capture = 42; + return [=]() constexpr { return Capture; }(); +} +static_assert(fn() == 42, ""); diff --git a/clang/test/AST/Interp/literals.cpp b/clang/test/AST/Interp/literals.cpp index 630d9b53cca259..9cd65462a0af33 100644 --- a/clang/test/AST/Interp/literals.cpp +++ b/clang/test/AST/Interp/literals.cpp @@ -568,37 +568,27 @@ namespace IncDec { return 1; } static_assert(uninit(), ""); // both-error {{not an integral constant expression}} \ - // ref-note {{in call to 'uninit()'}} \ - // expected-note {{in call to 'uninit()'}} + // both-note {{in call to 'uninit()'}} static_assert(uninit(), ""); // both-error {{not an integral constant expression}} \ - // ref-note {{in call to 'uninit()'}} \ - // expected-note {{in call to 'uninit()'}} + // both-note {{in call to 'uninit()'}} static_assert(uninit(), ""); // both-error {{not an integral constant expression}} \ - // ref-note {{in call to 'uninit()'}} \ - // expected-note {{in call to 'uninit()'}} + // both-note {{in call to 'uninit()'}} static_assert(uninit(), ""); // both-error {{not an integral constant expression}} \ - // ref-note {{in call to 'uninit()'}} \ - // expected-note {{in call to 'uninit()'}} + // both-note {{in call to 'uninit()'}} static_assert(uninit(), ""); // both-error {{not an integral constant expression}} \ - // ref-note {{in call to 'uninit()'}} \ - // expected-note {{in call to 'uninit()'}} + // both-note {{in call to 'uninit()'}} static_assert(uninit(), ""); // both-error {{not an integral constant expression}} \ - // ref-note {{in call to 'uninit()'}} \ - // expected-note {{in call to 'uninit()'}} + // both-note {{in call to 'uninit()'}} static_assert(uninit(), ""); // both-error {{not an integral constant expression}} \ - // ref-note {{in call to 'uninit()'}} \ - // expected-note {{in call to 'uninit()'}} + // both-note {{in call to 'uninit()'}} static_assert(uninit(), ""); // both-error {{not an integral constant expression}} \ - // ref-note {{in call to 'uninit()'}} \ - // expected-note {{in call to 'uninit()'}} + // both-note {{in call to 'uninit()'}} static_assert(uninit(), ""); // both-error {{not an integral constant expression}} \ - // ref-note {{in call to 'uninit()'}} \ - // expected-note {{in call to 'uninit()'}} + // both-note {{in call to 'uninit()'}} static_assert(uninit(), ""); // both-error {{not an integral constant expression}} \ - // ref-note {{in call to 'uninit()'}} \ - // expected-note {{in call to 'uninit()'}} + // both-note {{in call to 'uninit()'}} constexpr int OverFlow() { // both-error {{never produces a constant expression}} int a = INT_MAX; @@ -1242,6 +1232,13 @@ namespace Extern { } static_assert(&ExternNonLiteralVarDecl() == &nl, ""); #endif + + struct A { + int b; + }; + + extern constexpr A a{12}; + static_assert(a.b == 12, ""); } #if __cplusplus >= 201402L @@ -1276,3 +1273,33 @@ namespace ComparisonAgainstOnePastEnd { static_assert(&a + 1 == &b + 1, ""); // both-error {{static assertion failed}} }; + +namespace NTTP { + template + constexpr unsigned + size(const _Tp (&)[_Nm]) noexcept + { return _Nm; } + + template + static int write_padding() { + static const char Chars[] = {C}; + + return size(Chars); + } +} + +#if __cplusplus >= 201402L +namespace UnaryOpError { + constexpr int foo() { + int f = 0; + ++g; // both-error {{use of undeclared identifier 'g'}} + return f; + } +} +#endif + +namespace VolatileReads { + const volatile int b = 1; + static_assert(b, ""); // both-error {{not an integral constant expression}} \ + // both-note {{read of volatile-qualified type 'const volatile int' is not allowed in a constant expression}} +} diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp index 4b06fc7522d45c..2fc88a0b1df6a0 100644 --- a/clang/test/AST/Interp/records.cpp +++ b/clang/test/AST/Interp/records.cpp @@ -1512,3 +1512,28 @@ namespace OnePastEndAndBack { constexpr const Base *d = c - 1; static_assert(d == &a, ""); } + +namespace BitSet { + class Bitset { + unsigned Bit = 0; + + public: + constexpr Bitset() { + int Init[2] = {1,2}; + for (auto I : Init) + set(I); + } + constexpr void set(unsigned I) { + this->Bit++; + this->Bit = 1u << 1; + } + }; + + struct ArchInfo { + Bitset DefaultExts; + }; + + constexpr ArchInfo ARMV8A = { + Bitset() + }; +} diff --git a/clang/test/AST/Interp/shifts.cpp b/clang/test/AST/Interp/shifts.cpp index 76047d0f752d54..360b87b7ee04f8 100644 --- a/clang/test/AST/Interp/shifts.cpp +++ b/clang/test/AST/Interp/shifts.cpp @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++20 -verify %s -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++17 -verify=cxx17 %s -// RUN: %clang_cc1 -std=c++20 -verify=ref %s -// RUN: %clang_cc1 -std=c++17 -verify=ref-cxx17 %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++20 -verify=expected,all %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++17 -verify=cxx17,all %s +// RUN: %clang_cc1 -std=c++20 -verify=ref,all %s +// RUN: %clang_cc1 -std=c++17 -verify=ref-cxx17,all %s #define INT_MIN (~__INT_MAX__) @@ -198,3 +198,16 @@ namespace LongInt { } static_assert(f() == 1, ""); }; + +enum shiftof { + X = (1<<-29), // all-error {{expression is not an integral constant expression}} \ + // all-note {{negative shift count -29}} + + X2 = (-1<<29), // cxx17-error {{expression is not an integral constant expression}} \ + // cxx17-note {{left shift of negative value -1}} \ + // ref-cxx17-error {{expression is not an integral constant expression}} \ + // ref-cxx17-note {{left shift of negative value -1}} + + X3 = (1<<32) // all-error {{expression is not an integral constant expression}} \ + // all-note {{shift count 32 >= width of type 'int'}} +}; diff --git a/clang/test/C/C2y/n3244.c b/clang/test/C/C2y/n3244.c index c1f62d59d26906..158d7e3921ceb1 100644 --- a/clang/test/C/C2y/n3244.c +++ b/clang/test/C/C2y/n3244.c @@ -56,7 +56,7 @@ int AlignmentOnOriginalDecl; // expected-error {{'_Alignas' must be specified on _Static_assert(_Alignof(AlignmentOnOriginalDecl) == 8, ""); long long CompatibleAlignment; -_Static_assert(_Alignof(CompatibleAlignment) == _Alignof(long long), ""); +_Static_assert(_Alignof(__typeof__(CompatibleAlignment)) == _Alignof(long long), ""); _Alignas(_Alignof(long long)) long long CompatibleAlignment; // Okay, alignment is the same as the implied alignment _Alignas(_Alignof(long long)) long long CompatibleAlignment2; // expected-note {{declared with '_Alignas' attribute here}} diff --git a/clang/test/CXX/basic/basic.lookup/basic.lookup.classref/p1-cxx11.cpp b/clang/test/CXX/basic/basic.lookup/basic.lookup.classref/p1-cxx11.cpp index 11eb67fb4f159f..1afea99e8895c7 100644 --- a/clang/test/CXX/basic/basic.lookup/basic.lookup.classref/p1-cxx11.cpp +++ b/clang/test/CXX/basic/basic.lookup/basic.lookup.classref/p1-cxx11.cpp @@ -55,19 +55,15 @@ namespace PR11856 { template T *end(T*); - struct X { }; - struct Y { - int end; - }; + class X { }; template void Foo2() { T it1; - if (it1->end < it1->end) { } + if (it1->end < it1->end) { + } X *x; - if (x->end < 7) { } // expected-error{{no member named 'end' in 'PR11856::X'}} - - Y *y; - if (y->end < 7) { } + if (x->end < 7) { // expected-error{{no member named 'end' in 'PR11856::X'}} + } } } diff --git a/clang/test/CXX/basic/basic.lookup/basic.lookup.classref/p1.cpp b/clang/test/CXX/basic/basic.lookup/basic.lookup.classref/p1.cpp index 5221b883f046c5..e3599db18350bf 100644 --- a/clang/test/CXX/basic/basic.lookup/basic.lookup.classref/p1.cpp +++ b/clang/test/CXX/basic/basic.lookup/basic.lookup.classref/p1.cpp @@ -86,19 +86,15 @@ namespace PR11856 { template T *end(T*); - struct X { }; - struct Y { - int end; - }; + class X { }; template void Foo2() { T it1; - if (it1->end < it1->end) { } + if (it1->end < it1->end) { + } X *x; - if (x->end < 7) { } // expected-error{{no member named 'end' in 'PR11856::X'}} - - Y *y; - if (y->end < 7) { } + if (x->end < 7) { // expected-error{{no member named 'end' in 'PR11856::X'}} + } } } diff --git a/clang/test/CXX/basic/basic.lookup/basic.lookup.qual/basic.lookup.qual.general/p3-example3.cpp b/clang/test/CXX/basic/basic.lookup/basic.lookup.qual/basic.lookup.qual.general/p3-example3.cpp deleted file mode 100644 index 423eacd21d441e..00000000000000 --- a/clang/test/CXX/basic/basic.lookup/basic.lookup.qual/basic.lookup.qual.general/p3-example3.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// RUN: %clang_cc1 -std=c++23 %s -verify - -int f(); - -struct A { - int B, C; // expected-note {{declared as a non-template here}} - template using D = void; - using T = void; - void f(); -}; - -using B = A; -template using C = A; -template using D = A; -template using X = A; - -template -void g(T *p) { - p->X<0>::f(); // expected-error {{no member named 'X' in 'A'}} - p->template X<0>::f(); - p->B::f(); - p->template C<0>::f(); // expected-error {{'C' following the 'template' keyword does not refer to a template}} - p->template D<0>::f(); // expected-error {{type 'template D<0>' (aka 'void') cannot be used prior to '::' because it has no members}} - p->T::f(); // expected-error {{'A::T' (aka 'void') is not a class, namespace, or enumeration}} -} - -template void g(A*); // expected-note {{in instantiation of}} diff --git a/clang/test/CXX/basic/basic.lookup/basic.lookup.qual/basic.lookup.qual.general/p3.cpp b/clang/test/CXX/basic/basic.lookup/basic.lookup.qual/basic.lookup.qual.general/p3.cpp deleted file mode 100644 index 7d843649c3f300..00000000000000 --- a/clang/test/CXX/basic/basic.lookup/basic.lookup.qual/basic.lookup.qual.general/p3.cpp +++ /dev/null @@ -1,98 +0,0 @@ -// RUN: %clang_cc1 -std=c++23 -Wno-unused %s -verify - -namespace Unambiguous { - struct A { - int x; - - template - using C = A; - }; - - using B = A; - - template - using D = A; - - using E = void; - - struct F : A { - void non_template() { - this->x; - this->A::x; - this->B::x; - this->C::x; - this->D::x; - this->E::x; // expected-error {{'Unambiguous::E' (aka 'void') is not a class, namespace, or enumeration}} - } - }; - - template - void not_instantiated(T t) { - t.x; - t.A::x; - t.B::x; - t.C::x; // expected-warning {{use 'template' keyword to treat 'C' as a dependent template name}} - t.template C::x; - t.D::x; // expected-warning {{use 'template' keyword to treat 'D' as a dependent template name}} - t.template D::x; - t.E::x; - } - - template - void instantiated_valid(T t) { - t.x; - t.A::x; - t.B::x; - t.template C::x; - t.template D::x; - t.E::x; - } - - template - void instantiated_invalid(T t) { - t.x; - t.A::x; - t.B::x; // expected-error {{'Unambiguous::Invalid::B' (aka 'void') is not a class, namespace, or enumeration}} - t.template C::x; - t.template D::x; // expected-error {{'D' following the 'template' keyword does not refer to a template}} - t.E::x; // expected-error {{'Unambiguous::E' (aka 'void') is not a class, namespace, or enumeration}} - } - - struct Valid : A { - using E = A; - }; - - template void instantiated_valid(Valid); - - struct Invalid : A { - using B = void; - using D = A; // expected-note {{declared as a non-template here}} - }; - - template void instantiated_invalid(Invalid); // expected-note {{in instantiation of}} -} // namespace Unambiguous - -namespace Ambiguous { - inline namespace N { - struct A { }; // expected-note {{candidate found by name lookup is 'Ambiguous::N::A'}} - } - - struct A { }; // expected-note {{candidate found by name lookup is 'Ambiguous::A'}} - - template - void f(T t) { - t.A::x; // expected-error {{reference to 'A' is ambiguous}} - } - - struct B { - using A = B; - - int x; - }; - - struct C { }; - - template void f(B); - template void f(C); // expected-note {{in instantiation of}} - -} // namespace Ambiguous diff --git a/clang/test/CXX/class.derived/class.member.lookup/p8.cpp b/clang/test/CXX/class.derived/class.member.lookup/p8.cpp index 97d3587881bbc1..78e83c0ab4566c 100644 --- a/clang/test/CXX/class.derived/class.member.lookup/p8.cpp +++ b/clang/test/CXX/class.derived/class.member.lookup/p8.cpp @@ -47,8 +47,8 @@ template void DerivedT::Inner() { Derived1T::Foo(); Derived2T::Member = 42; - this->Derived1T::Foo(); // expected-warning{{use 'template' keyword to treat 'Derived1T' as a dependent template name}} - this->Derived2T::Member = 42; // expected-warning{{use 'template' keyword to treat 'Derived2T' as a dependent template name}} + this->Derived1T::Foo(); + this->Derived2T::Member = 42; this->Foo(); // expected-error{{non-static member 'Foo' found in multiple base-class subobjects of type 'BaseT'}} } diff --git a/clang/test/CXX/class/class.compare/class.compare.default/p1.cpp b/clang/test/CXX/class/class.compare/class.compare.default/p1.cpp index 252860bfc4de07..ddf82f432c2eab 100644 --- a/clang/test/CXX/class/class.compare/class.compare.default/p1.cpp +++ b/clang/test/CXX/class/class.compare/class.compare.default/p1.cpp @@ -265,3 +265,21 @@ void f2() { // access info for unnamed bit-field } } + +namespace GH96043 { +template class a {}; +template b c(a); +template class e { +public: + typedef a f; + f begin(); +}; +template constexpr bool operator==(d h, g i) { + return *c(h.begin()) == *c(i.begin()); +} +struct j { + e bar; + bool operator==(const j &) const; +}; +bool j::operator==(const j &) const = default; +} diff --git a/clang/test/CXX/drs/cwg1xx.cpp b/clang/test/CXX/drs/cwg1xx.cpp index 6bca4608184254..e7dddd1ea9278f 100644 --- a/clang/test/CXX/drs/cwg1xx.cpp +++ b/clang/test/CXX/drs/cwg1xx.cpp @@ -615,8 +615,10 @@ namespace cwg141 { // cwg141: 3.1 // cxx98-note@#cwg141-S {{lookup from the current scope refers here}} // expected-error@#cwg141-a {{no member named 'n' in 'cwg141::A::S'; did you mean '::cwg141::S::n'?}} // expected-note@#cwg141-S {{'::cwg141::S::n' declared here}} + // FIXME: we issue a useful diagnostic first, then some bogus ones. b.f(); // expected-error@-1 {{no member named 'f' in 'cwg141::B'}} + // expected-error@-2 +{{}} (void)b.S::n; } template struct C { @@ -626,12 +628,10 @@ namespace cwg141 { // cwg141: 3.1 // expected-error@-1 {{use 'template' keyword to treat 'f' as a dependent template name}} } void h() { - (void)t.S::n; - // expected-error@-1 {{use 'template' keyword to treat 'S' as a dependent template name}} + (void)t.S::n; // ok } void i() { - (void)t.S(); - // expected-error@-1 {{use 'template' keyword to treat 'S' as a dependent template name}} + (void)t.S(); // ok! } }; void h() { C().h(); } // ok diff --git a/clang/test/CXX/drs/cwg20xx.cpp b/clang/test/CXX/drs/cwg20xx.cpp index 9797097acce753..fdd31845d5e0db 100644 --- a/clang/test/CXX/drs/cwg20xx.cpp +++ b/clang/test/CXX/drs/cwg20xx.cpp @@ -313,7 +313,8 @@ namespace cwg2083 { // cwg2083: partial int &r = a.x; // #cwg2083-r struct B { void f() { - // FIXME: We emit more errors than we should be. They are explictly marked below. + // FIXME: We emit more errors than we should be. They are explicitly + // marked below. a.x; // expected-warning@-1 {{expression result unused}} // expected-error@-2 {{reference to local variable 'a' declared in enclosing function 'cwg2083::discarded_lval'}} FIXME diff --git a/clang/test/CXX/temp/temp.names/p3-23.cpp b/clang/test/CXX/temp/temp.names/p3-23.cpp deleted file mode 100644 index 27c24e1d61706e..00000000000000 --- a/clang/test/CXX/temp/temp.names/p3-23.cpp +++ /dev/null @@ -1,237 +0,0 @@ -// RUN: %clang_cc1 -std=c++23 -Wno-unused %s -verify - -namespace FoundNothing { - template - void f0(T &t) { - t.x<0; - t.x<0>; // expected-error {{expected expression}} - t.x<0>1; - } - - template - struct A { - void f1() { - this->x<0; // expected-error {{no member named 'x' in 'A'}} - this->x<0>; // expected-error {{no member named 'x' in 'A'}} - // expected-error@-1 {{expected expression}} - this->x<0>1; // expected-error {{no member named 'x' in 'A'}} - } - }; -} // namespace FoundNothing - -namespace FoundSingleNonTemplate { - void f0(); - - struct A0; - - template - void g0(T &t) { - t.f0<0; - t.f0<0>; // expected-error {{expected expression}} - t.f0<0>1; - - t.A0<0; - t.A0<0>; // expected-error {{expected expression}} - t.A0<0>1; - } - - template - struct B { - void f1(); - - struct A1; // expected-note 3{{member 'A1' declared here}} - - void g1() { - this->f0<0; // expected-error {{no member named 'f0' in 'B'}} - this->f0<0>; // expected-error {{no member named 'f0' in 'B'}} - // expected-error@-1 {{expected expression}} - this->f0<0>1; // expected-error {{no member named 'f0' in 'B'}} - - this->A0<0; // expected-error {{no member named 'A0' in 'B'}} - this->A0<0>; // expected-error {{no member named 'A0' in 'B'}} - // expected-error@-1 {{expected expression}} - this->A0<0>1; // expected-error {{no member named 'A0' in 'B'}} - - this->f1<0; // expected-error {{reference to non-static member function must be called}} - this->f1<0>; // expected-error {{reference to non-static member function must be called}} - // expected-error@-1 {{expected expression}} - this->f1<0>1; // expected-error {{reference to non-static member function must be called}} - - this->A1<0; // expected-error {{cannot refer to type member 'A1' in 'B' with '->'}} - this->A1<0>; // expected-error {{cannot refer to type member 'A1' in 'B' with '->'}} - // expected-error@-1 {{expected expression}} - this->A1<0>1; // expected-error {{cannot refer to type member 'A1' in 'B' with '->'}} - } - }; -} // namespace FoundSingleNonTemplate - -namespace FoundSingleTemplate { - template - void f0(); - - template - struct A0; - - template - void g0(T &t) { - t.f0<0; - t.f0<0>; // expected-error {{expected expression}} - t.f0<0>1; - - t.A0<0; - t.A0<0>; // expected-error {{expected expression}} - t.A0<0>1; - } - - template - struct B { - template - void f1(); // expected-note 2{{possible target for call}} - - template - struct A1; // expected-note 2{{member 'A1' declared here}} - - void g1() { - this->f0<0; // expected-error {{no member named 'f0' in 'B'}} - this->f0<0>; // expected-error {{no member named 'f0' in 'B'}} - this->f0<0>1; // expected-error {{no member named 'f0' in 'B'}} - // expected-error@-1 {{expected ';' after expression}} - - this->A0<0; // expected-error {{no member named 'A0' in 'B'}} - this->A0<0>; // expected-error {{no member named 'A0' in 'B'}} - this->A0<0>1; // expected-error {{no member named 'A0' in 'B'}} - // expected-error@-1 {{expected ';' after expression}} - - - this->f1<0; // expected-error {{expected '>'}} - // expected-note@-1 {{to match this '<'}} - this->f1<0>; // expected-error {{reference to non-static member function must be called}} - this->f1<0>1; // expected-error {{reference to non-static member function must be called}} - // expected-error@-1 {{expected ';' after expression}} - - this->A1<0; // expected-error {{expected '>'}} - // expected-note@-1 {{to match this '<'}} - this->A1<0>; // expected-error {{cannot refer to member 'A1' in 'B' with '->'}} - this->A1<0>1; // expected-error {{cannot refer to member 'A1' in 'B' with '->'}} - // expected-error@-1 {{expected ';' after expression}} - } - }; -} // namespace FoundSingleTemplate - -namespace FoundAmbiguousNonTemplate { - inline namespace N { - int f0; - - struct A0; - } // namespace N - - void f0(); - - struct A0; - - template - void g0(T &t) { - t.f0<0; - t.f0<0>; // expected-error {{expected expression}} - t.f0<0>1; - - t.A0<0; - t.A0<0>; // expected-error {{expected expression}} - t.A0<0>1; - } - - template - struct B { - void f1(); - - struct A1; // expected-note 3{{member 'A1' declared here}} - - void g1() { - this->f0<0; // expected-error {{no member named 'f0' in 'B'}} - this->f0<0>; // expected-error {{no member named 'f0' in 'B'}} - // expected-error@-1 {{expected expression}} - this->f0<0>1; // expected-error {{no member named 'f0' in 'B'}} - - this->A0<0; // expected-error {{no member named 'A0' in 'B'}} - this->A0<0>; // expected-error {{no member named 'A0' in 'B'}} - // expected-error@-1 {{expected expression}} - this->A0<0>1; // expected-error {{no member named 'A0' in 'B'}} - - this->f1<0; // expected-error {{reference to non-static member function must be called}} - this->f1<0>; // expected-error {{reference to non-static member function must be called}} - // expected-error@-1 {{expected expression}} - this->f1<0>1; // expected-error {{reference to non-static member function must be called}} - - this->A1<0; // expected-error {{cannot refer to type member 'A1' in 'B' with '->'}} - this->A1<0>; // expected-error {{cannot refer to type member 'A1' in 'B' with '->'}} - // expected-error@-1 {{expected expression}} - this->A1<0>1; // expected-error {{cannot refer to type member 'A1' in 'B' with '->'}} - } - }; -} // namespace FoundAmbiguousNonTemplates - -namespace FoundAmbiguousTemplate { - inline namespace N { - template - int f0; // expected-note 3{{candidate found by name lookup is 'FoundAmbiguousTemplate::N::f0'}} - - template - struct A0; // expected-note 3{{candidate found by name lookup is 'FoundAmbiguousTemplate::N::A0'}} - } // namespace N - - template - void f0(); // expected-note 3{{candidate found by name lookup is 'FoundAmbiguousTemplate::f0'}} - - template - struct A0; // expected-note 3{{candidate found by name lookup is 'FoundAmbiguousTemplate::A0'}} - - template - void g0(T &t) { - t.f0<0; - t.f0<0>; // expected-error {{expected expression}} - t.f0<0>1; - - t.A0<0; - t.A0<0>; // expected-error {{expected expression}} - t.A0<0>1; - } - - template - struct B { - template - void f1(); // expected-note 2{{possible target for call}} - - template - struct A1; // expected-note 2{{member 'A1' declared here}} - - void g1() { - this->f0<0; // expected-error {{no member named 'f0' in 'B'}} - // expected-error@-1 {{reference to 'f0' is ambiguous}} - this->f0<0>; // expected-error {{no member named 'f0' in 'B'}} - // expected-error@-1 {{reference to 'f0' is ambiguous}} - this->f0<0>1; // expected-error {{no member named 'f0' in 'B'}} - // expected-error@-1 {{expected ';' after expression}} - // expected-error@-2 {{reference to 'f0' is ambiguous}} - - this->A0<0; // expected-error {{no member named 'A0' in 'B'}} - // expected-error@-1 {{reference to 'A0' is ambiguous}} - this->A0<0>; // expected-error {{no member named 'A0' in 'B'}} - // expected-error@-1 {{reference to 'A0' is ambiguous}} - this->A0<0>1; // expected-error {{no member named 'A0' in 'B'}} - // expected-error@-1 {{expected ';' after expression}} - // expected-error@-2 {{reference to 'A0' is ambiguous}} - - this->f1<0; // expected-error {{expected '>'}} - // expected-note@-1 {{to match this '<'}} - this->f1<0>; // expected-error {{reference to non-static member function must be called}} - this->f1<0>1; // expected-error {{reference to non-static member function must be called}} - // expected-error@-1 {{expected ';' after expression}} - - this->A1<0; // expected-error {{expected '>'}} - // expected-note@-1 {{to match this '<'}} - this->A1<0>; // expected-error {{cannot refer to member 'A1' in 'B' with '->'}} - this->A1<0>1; // expected-error {{cannot refer to member 'A1' in 'B' with '->'}} - // expected-error@-1 {{expected ';' after expression}} - } - }; -} // namespace FoundAmbiguousTemplate diff --git a/clang/test/CXX/temp/temp.res/p3.cpp b/clang/test/CXX/temp/temp.res/p3.cpp index a4d735e05e9b83..37ab93559e3690 100644 --- a/clang/test/CXX/temp/temp.res/p3.cpp +++ b/clang/test/CXX/temp/temp.res/p3.cpp @@ -30,6 +30,6 @@ template int A::template C::*f5() {} // expected-error {{has template template struct A::B { friend A::C f6(); // ok, same as 'friend T f6();' - friend A::C f7(); // expected-warning {{use 'template' keyword to treat 'C' as a dependent template name}} expected-warning {{missing 'typename'}} + friend A::C f7(); // expected-error {{use 'template' keyword to treat 'C' as a dependent template name}} expected-warning {{missing 'typename'}} friend A::template C f8(); // expected-warning {{missing 'typename'}} }; diff --git a/clang/test/CodeGen/2009-06-14-anonymous-union-init.c b/clang/test/CodeGen/2009-06-14-anonymous-union-init.c index 8ccd7bc4ec3384..13f6357f7966d9 100644 --- a/clang/test/CodeGen/2009-06-14-anonymous-union-init.c +++ b/clang/test/CodeGen/2009-06-14-anonymous-union-init.c @@ -1,7 +1,19 @@ -// RUN: %clang_cc1 -emit-llvm < %s | grep "zeroinitializer, i16 16877" +// RUN: %clang_cc1 %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s --check-prefixes=CHECK,EMPTY +// RUN: %clang_cc1 %s -emit-llvm -triple x86_64-windows-msvc -o - | FileCheck %s --check-prefixes=CHECK,EMPTY-MSVC // PR4390 struct sysfs_dirent { - union { struct sysfs_elem_dir {} s_dir; }; + union { struct sysfs_elem_dir { int x; } s_dir; }; unsigned short s_mode; }; struct sysfs_dirent sysfs_root = { {}, 16877 }; + +// CHECK: @sysfs_root = {{.*}}global %struct.sysfs_dirent { %union.anon zeroinitializer, i16 16877 } + +struct Foo { + union { struct empty {} x; }; + unsigned short s_mode; +}; +struct Foo foo = { {}, 16877 }; + +// EMPTY: @foo = {{.*}}global %struct.Foo { i16 16877 } +// EMPTY-MSVC: @foo = {{.*}}global %struct.Foo { [4 x i8] undef, i16 16877 } diff --git a/clang/test/CodeGen/X86/3dnow-builtins.c b/clang/test/CodeGen/X86/3dnow-builtins.c deleted file mode 100644 index af754b71555c41..00000000000000 --- a/clang/test/CodeGen/X86/3dnow-builtins.c +++ /dev/null @@ -1,181 +0,0 @@ -// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +3dnowa -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=GCC -check-prefix=CHECK -// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-scei-ps4 -target-feature +3dnowa -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=PS4 -check-prefix=CHECK -// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-sie-ps5 -target-feature +3dnowa -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=PS4 -check-prefix=CHECK - - -#include - -__m64 test_m_pavgusb(__m64 m1, __m64 m2) { - // PS4-LABEL: define{{.*}} i64 @test_m_pavgusb - // GCC-LABEL: define{{.*}} double @test_m_pavgusb - // CHECK: @llvm.x86.3dnow.pavgusb - return _m_pavgusb(m1, m2); -} - -__m64 test_m_pf2id(__m64 m) { - // PS4-LABEL: define{{.*}} i64 @test_m_pf2id - // GCC-LABEL: define{{.*}} double @test_m_pf2id - // CHECK: @llvm.x86.3dnow.pf2id - return _m_pf2id(m); -} - -__m64 test_m_pfacc(__m64 m1, __m64 m2) { - // PS4-LABEL: define{{.*}} i64 @test_m_pfacc - // GCC-LABEL: define{{.*}} double @test_m_pfacc - // CHECK: @llvm.x86.3dnow.pfacc - return _m_pfacc(m1, m2); -} - -__m64 test_m_pfadd(__m64 m1, __m64 m2) { - // PS4-LABEL: define{{.*}} i64 @test_m_pfadd - // GCC-LABEL: define{{.*}} double @test_m_pfadd - // CHECK: @llvm.x86.3dnow.pfadd - return _m_pfadd(m1, m2); -} - -__m64 test_m_pfcmpeq(__m64 m1, __m64 m2) { - // PS4-LABEL: define{{.*}} i64 @test_m_pfcmpeq - // GCC-LABEL: define{{.*}} double @test_m_pfcmpeq - // CHECK: @llvm.x86.3dnow.pfcmpeq - return _m_pfcmpeq(m1, m2); -} - -__m64 test_m_pfcmpge(__m64 m1, __m64 m2) { - // PS4-LABEL: define{{.*}} i64 @test_m_pfcmpge - // GCC-LABEL: define{{.*}} double @test_m_pfcmpge - // CHECK: @llvm.x86.3dnow.pfcmpge - return _m_pfcmpge(m1, m2); -} - -__m64 test_m_pfcmpgt(__m64 m1, __m64 m2) { - // PS4-LABEL: define{{.*}} i64 @test_m_pfcmpgt - // GCC-LABEL: define{{.*}} double @test_m_pfcmpgt - // CHECK: @llvm.x86.3dnow.pfcmpgt - return _m_pfcmpgt(m1, m2); -} - -__m64 test_m_pfmax(__m64 m1, __m64 m2) { - // PS4-LABEL: define{{.*}} i64 @test_m_pfmax - // GCC-LABEL: define{{.*}} double @test_m_pfmax - // CHECK: @llvm.x86.3dnow.pfmax - return _m_pfmax(m1, m2); -} - -__m64 test_m_pfmin(__m64 m1, __m64 m2) { - // PS4-LABEL: define{{.*}} i64 @test_m_pfmin - // GCC-LABEL: define{{.*}} double @test_m_pfmin - // CHECK: @llvm.x86.3dnow.pfmin - return _m_pfmin(m1, m2); -} - -__m64 test_m_pfmul(__m64 m1, __m64 m2) { - // PS4-LABEL: define{{.*}} i64 @test_m_pfmul - // GCC-LABEL: define{{.*}} double @test_m_pfmul - // CHECK: @llvm.x86.3dnow.pfmul - return _m_pfmul(m1, m2); -} - -__m64 test_m_pfrcp(__m64 m) { - // PS4-LABEL: define{{.*}} i64 @test_m_pfrcp - // GCC-LABEL: define{{.*}} double @test_m_pfrcp - // CHECK: @llvm.x86.3dnow.pfrcp - return _m_pfrcp(m); -} - -__m64 test_m_pfrcpit1(__m64 m1, __m64 m2) { - // PS4-LABEL: define{{.*}} i64 @test_m_pfrcpit1 - // GCC-LABEL: define{{.*}} double @test_m_pfrcpit1 - // CHECK: @llvm.x86.3dnow.pfrcpit1 - return _m_pfrcpit1(m1, m2); -} - -__m64 test_m_pfrcpit2(__m64 m1, __m64 m2) { - // PS4-LABEL: define{{.*}} i64 @test_m_pfrcpit2 - // GCC-LABEL: define{{.*}} double @test_m_pfrcpit2 - // CHECK: @llvm.x86.3dnow.pfrcpit2 - return _m_pfrcpit2(m1, m2); -} - -__m64 test_m_pfrsqrt(__m64 m) { - // PS4-LABEL: define{{.*}} i64 @test_m_pfrsqrt - // GCC-LABEL: define{{.*}} double @test_m_pfrsqrt - // CHECK: @llvm.x86.3dnow.pfrsqrt - return _m_pfrsqrt(m); -} - -__m64 test_m_pfrsqrtit1(__m64 m1, __m64 m2) { - // PS4-LABEL: define{{.*}} i64 @test_m_pfrsqrtit1 - // GCC-LABEL: define{{.*}} double @test_m_pfrsqrtit1 - // CHECK: @llvm.x86.3dnow.pfrsqit1 - return _m_pfrsqrtit1(m1, m2); -} - -__m64 test_m_pfsub(__m64 m1, __m64 m2) { - // PS4-LABEL: define{{.*}} i64 @test_m_pfsub - // GCC-LABEL: define{{.*}} double @test_m_pfsub - // CHECK: @llvm.x86.3dnow.pfsub - return _m_pfsub(m1, m2); -} - -__m64 test_m_pfsubr(__m64 m1, __m64 m2) { - // PS4-LABEL: define{{.*}} i64 @test_m_pfsubr - // GCC-LABEL: define{{.*}} double @test_m_pfsubr - // CHECK: @llvm.x86.3dnow.pfsubr - return _m_pfsubr(m1, m2); -} - -__m64 test_m_pi2fd(__m64 m) { - // PS4-LABEL: define{{.*}} i64 @test_m_pi2fd - // GCC-LABEL: define{{.*}} double @test_m_pi2fd - // CHECK: @llvm.x86.3dnow.pi2fd - return _m_pi2fd(m); -} - -__m64 test_m_pmulhrw(__m64 m1, __m64 m2) { - // PS4-LABEL: define{{.*}} i64 @test_m_pmulhrw - // GCC-LABEL: define{{.*}} double @test_m_pmulhrw - // CHECK: @llvm.x86.3dnow.pmulhrw - return _m_pmulhrw(m1, m2); -} - -__m64 test_m_pf2iw(__m64 m) { - // PS4-LABEL: define{{.*}} i64 @test_m_pf2iw - // GCC-LABEL: define{{.*}} double @test_m_pf2iw - // CHECK: @llvm.x86.3dnowa.pf2iw - return _m_pf2iw(m); -} - -__m64 test_m_pfnacc(__m64 m1, __m64 m2) { - // PS4-LABEL: define{{.*}} i64 @test_m_pfnacc - // GCC-LABEL: define{{.*}} double @test_m_pfnacc - // CHECK: @llvm.x86.3dnowa.pfnacc - return _m_pfnacc(m1, m2); -} - -__m64 test_m_pfpnacc(__m64 m1, __m64 m2) { - // PS4-LABEL: define{{.*}} i64 @test_m_pfpnacc - // GCC-LABEL: define{{.*}} double @test_m_pfpnacc - // CHECK: @llvm.x86.3dnowa.pfpnacc - return _m_pfpnacc(m1, m2); -} - -__m64 test_m_pi2fw(__m64 m) { - // PS4-LABEL: define{{.*}} i64 @test_m_pi2fw - // GCC-LABEL: define{{.*}} double @test_m_pi2fw - // CHECK: @llvm.x86.3dnowa.pi2fw - return _m_pi2fw(m); -} - -__m64 test_m_pswapdsf(__m64 m) { - // PS4-LABEL: define{{.*}} i64 @test_m_pswapdsf - // GCC-LABEL: define{{.*}} double @test_m_pswapdsf - // CHECK: @llvm.x86.3dnowa.pswapd - return _m_pswapdsf(m); -} - -__m64 test_m_pswapdsi(__m64 m) { - // PS4-LABEL: define{{.*}} i64 @test_m_pswapdsi - // GCC-LABEL: define{{.*}} double @test_m_pswapdsi - // CHECK: @llvm.x86.3dnowa.pswapd - return _m_pswapdsi(m); -} diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c index 1e0f129b986102..d26db19574051a 100644 --- a/clang/test/CodeGen/X86/math-builtins.c +++ b/clang/test/CodeGen/X86/math-builtins.c @@ -364,27 +364,31 @@ __builtin_floor(f); __builtin_floorf(f); __builtin_floorl(f); __builtin // HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC]] -__builtin_fma(f,f,f); __builtin_fmaf(f,f,f); __builtin_fmal(f,f,f); __builtin_fmaf128(f,f,f); +__builtin_fma(f,f,f); __builtin_fmaf(f,f,f); __builtin_fmal(f,f,f); __builtin_fmaf128(f,f,f); __builtin_fmaf16(f,f,f); // NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC]] // NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC]] // NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] // NO__ERRNO: declare fp128 @llvm.fma.f128(fp128, fp128, fp128) [[READNONE_INTRINSIC]] +// NO__ERRONO: declare half @llvm.fma.f16(half, half, half) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare double @fma(double noundef, double noundef, double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @fmaf(float noundef, float noundef, float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @fmal(x86_fp80 noundef, x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]] // HAS_ERRNO: declare fp128 @fmaf128(fp128 noundef, fp128 noundef, fp128 noundef) [[NOT_READNONE]] +// HAS_ERRNO: declare half @fmaf16(half noundef, half noundef, half noundef) [[NOT_READNONE]] // On GNU or Win, fma never sets errno, so we can convert to the intrinsic. // HAS_ERRNO_GNU: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC:#[0-9]+]] // HAS_ERRNO_GNU: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC]] // HAS_ERRNO_GNU: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// HAS_ERRNO_GNU: declare half @llvm.fma.f16(half, half, half) [[READNONE_INTRINSIC]] // HAS_ERRNO_WIN: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC:#[0-9]+]] // HAS_ERRNO_WIN: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC]] // Long double is just double on win, so no f80 use/declaration. // HAS_ERRNO_WIN-NOT: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) +// HAS_ERRNO_WIN: declare half @llvm.fma.f16(half, half, half) [[READNONE_INTRINSIC]] __builtin_fmax(f,f); __builtin_fmaxf(f,f); __builtin_fmaxl(f,f); __builtin_fmaxf128(f,f); diff --git a/clang/test/CodeGen/X86/x86_64-vaarg.c b/clang/test/CodeGen/X86/x86_64-vaarg.c index d6b885d9fb18c3..7d5102f93ca6f0 100644 --- a/clang/test/CodeGen/X86/x86_64-vaarg.c +++ b/clang/test/CodeGen/X86/x86_64-vaarg.c @@ -56,7 +56,8 @@ typedef struct { // CHECK: vaarg.end: // CHECK-NEXT: [[VAARG_ADDR:%.*]] = phi ptr [ [[TMP1]], [[VAARG_IN_REG]] ], [ [[OVERFLOW_ARG_AREA]], [[VAARG_IN_MEM]] ] // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[VAARG_ADDR]], i64 8, i1 false) -// CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[COERCE:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[COERCE]], align 8 // CHECK-NEXT: ret double [[TMP3]] // s1 f(int z, ...) { diff --git a/clang/test/CodeGen/aarch64-byval-temp.c b/clang/test/CodeGen/aarch64-byval-temp.c index 0384830c69a419..0ee0312b2362de 100644 --- a/clang/test/CodeGen/aarch64-byval-temp.c +++ b/clang/test/CodeGen/aarch64-byval-temp.c @@ -80,33 +80,41 @@ void example_BitInt(void) { } // CHECK-O0-LABEL: define dso_local void @example_BitInt( // CHECK-O0-NEXT: entry: -// CHECK-O0-NEXT: [[L:%.*]] = alloca i129, align 16 -// CHECK-O0-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca i129, align 16 -// CHECK-O0-NEXT: [[INDIRECT_ARG_TEMP1:%.*]] = alloca i129, align 16 -// CHECK-O0-NEXT: store i129 0, ptr [[L]], align 16 -// CHECK-O0-NEXT: [[TMP0:%.*]] = load i129, ptr [[L]], align 16 -// CHECK-O0-NEXT: store i129 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 16 +// CHECK-O0-NEXT: [[L:%.*]] = alloca i256, align 16 +// CHECK-O0-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca i256, align 16 +// CHECK-O0-NEXT: [[INDIRECT_ARG_TEMP1:%.*]] = alloca i256, align 16 +// CHECK-O0-NEXT: store i256 0, ptr [[L]], align 16 +// CHECK-O0-NEXT: [[TMP0:%.*]] = load i256, ptr [[L]], align 16 +// CHECK-O0-NEXT: [[LOADEDV:%.*]] = trunc i256 [[TMP0]] to i129 +// CHECK-O0-NEXT: [[STOREDV:%.*]] = sext i129 [[LOADEDV]] to i256 +// CHECK-O0-NEXT: store i256 [[STOREDV]], ptr [[INDIRECT_ARG_TEMP]], align 16 // CHECK-O0-NEXT: call void @pass_large_BitInt(ptr noundef [[INDIRECT_ARG_TEMP]]) -// CHECK-O0-NEXT: [[TMP1:%.*]] = load i129, ptr [[L]], align 16 -// CHECK-O0-NEXT: store i129 [[TMP1]], ptr [[INDIRECT_ARG_TEMP1]], align 16 +// CHECK-O0-NEXT: [[TMP1:%.*]] = load i256, ptr [[L]], align 16 +// CHECK-O0-NEXT: [[LOADEDV1:%.*]] = trunc i256 [[TMP1]] to i129 +// CHECK-O0-NEXT: [[STOREDV1:%.*]] = sext i129 [[LOADEDV1]] to i256 +// CHECK-O0-NEXT: store i256 [[STOREDV1]], ptr [[INDIRECT_ARG_TEMP1]], align 16 // CHECK-O0-NEXT: call void @pass_large_BitInt(ptr noundef [[INDIRECT_ARG_TEMP1]]) // CHECK-O0-NEXT: ret void // // CHECK-O3-LABEL: define dso_local void @example_BitInt( // CHECK-O3-NEXT: entry: -// CHECK-O3-NEXT: [[L:%.*]] = alloca i129, align 16 -// CHECK-O3-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca i129, align 16 -// CHECK-O3-NEXT: [[INDIRECT_ARG_TEMP1:%.*]] = alloca i129, align 16 +// CHECK-O3-NEXT: [[L:%.*]] = alloca i256, align 16 +// CHECK-O3-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca i256, align 16 +// CHECK-O3-NEXT: [[INDIRECT_ARG_TEMP1:%.*]] = alloca i256, align 16 // CHECK-O3-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[L]]) -// CHECK-O3-NEXT: store i129 0, ptr [[L]], align 16, !tbaa [[TBAA6:![0-9]+]] -// CHECK-O3-NEXT: [[TMP0:%.*]] = load i129, ptr [[L]], align 16, !tbaa [[TBAA6]] +// CHECK-O3-NEXT: store i256 0, ptr [[L]], align 16, !tbaa [[TBAA6:![0-9]+]] +// CHECK-O3-NEXT: [[TMP0:%.*]] = load i256, ptr [[L]], align 16, !tbaa [[TBAA6]] +// CHECK-O3-NEXT: [[LOADEDV:%.*]] = trunc i256 [[TMP0]] to i129 // CHECK-O3-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[INDIRECT_ARG_TEMP]]) -// CHECK-O3-NEXT: store i129 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 16, !tbaa [[TBAA6]] +// CHECK-O3-NEXT: [[STOREDV:%.*]] = sext i129 [[LOADEDV]] to i256 +// CHECK-O3-NEXT: store i256 [[STOREDV]], ptr [[INDIRECT_ARG_TEMP]], align 16, !tbaa [[TBAA6]] // CHECK-O3-NEXT: call void @pass_large_BitInt(ptr noundef [[INDIRECT_ARG_TEMP]]) // CHECK-O3-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[INDIRECT_ARG_TEMP]]) -// CHECK-O3-NEXT: [[TMP1:%.*]] = load i129, ptr [[L]], align 16, !tbaa [[TBAA6]] +// CHECK-O3-NEXT: [[TMP1:%.*]] = load i256, ptr [[L]], align 16, !tbaa [[TBAA6]] +// CHECK-O3-NEXT: [[LOADEDV1:%.*]] = trunc i256 [[TMP1]] to i129 // CHECK-O3-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[INDIRECT_ARG_TEMP1]]) -// CHECK-O3-NEXT: store i129 [[TMP1]], ptr [[INDIRECT_ARG_TEMP1]], align 16, !tbaa [[TBAA6]] +// CHECK-O3-NEXT: [[STOREDV1:%.*]] = sext i129 [[LOADEDV1]] to i256 +// CHECK-O3-NEXT: store i256 [[STOREDV1]], ptr [[INDIRECT_ARG_TEMP1]], align 16, !tbaa [[TBAA6]] // CHECK-O3-NEXT: call void @pass_large_BitInt(ptr noundef [[INDIRECT_ARG_TEMP1]]) // CHECK-O3-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[INDIRECT_ARG_TEMP1]]) // CHECK-O3-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[L]]) diff --git a/clang/test/CodeGen/asan-destructor-kind.cpp b/clang/test/CodeGen/asan-destructor-kind.cpp index 50188067c68b32..73e9185a65fc96 100644 --- a/clang/test/CodeGen/asan-destructor-kind.cpp +++ b/clang/test/CodeGen/asan-destructor-kind.cpp @@ -9,7 +9,7 @@ // RUN: %clang_cc1 -fsanitize=address -emit-llvm -o - -triple x86_64-apple-macosx10.15 %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-GLOBAL-DTOR -// Explictly ask for global dtor +// Explicitly ask for global dtor // RUN: %clang_cc1 -fsanitize=address \ // RUN: -fsanitize-address-destructor=global -emit-llvm -o - \ // RUN: -triple x86_64-apple-macosx10.15 %s | \ @@ -18,7 +18,7 @@ // CHECK-GLOBAL-DTOR: llvm.global_dtor{{.+}}asan.module_dtor // CHECK-GLOBAL-DTOR: define internal void @asan.module_dtor -// Explictly ask for no dtors +// Explicitly ask for no dtors // RUN: %clang_cc1 -fsanitize=address \ // RUN: -fsanitize-address-destructor=none -emit-llvm -o - \ // RUN: -triple x86_64-apple-macosx10.15 %s | \ diff --git a/clang/test/CodeGen/attr-noundef.cpp b/clang/test/CodeGen/attr-noundef.cpp index e1cab091bfcbfb..abdf9496bd3963 100644 --- a/clang/test/CodeGen/attr-noundef.cpp +++ b/clang/test/CodeGen/attr-noundef.cpp @@ -157,11 +157,10 @@ void pass_large_BitInt(_BitInt(127) e) { // CHECK: [[DEF]] ptr @{{.*}}ret_npt{{.*}}() // CHECK: [[DEF]] void @{{.*}}pass_npt{{.*}}(ptr % -// TODO: for now, ExtInt is only noundef if it is sign/zero-extended // CHECK-INTEL: [[DEF]] noundef signext i3 @{{.*}}ret_BitInt{{.*}}() -// CHECK-AARCH: [[DEF]] i3 @{{.*}}ret_BitInt{{.*}}() +// CHECK-AARCH: [[DEF]] noundef i3 @{{.*}}ret_BitInt{{.*}}() // CHECK-INTEL: [[DEF]] void @{{.*}}pass_BitInt{{.*}}(i3 noundef signext % -// CHECK-AARCH: [[DEF]] void @{{.*}}pass_BitInt{{.*}}(i3 % -// CHECK-INTEL: [[DEF]] void @{{.*}}pass_large_BitInt{{.*}}(i64 %{{.*}}, i64 % -// CHECK-AARCH: [[DEF]] void @{{.*}}pass_large_BitInt{{.*}}(i127 % +// CHECK-AARCH: [[DEF]] void @{{.*}}pass_BitInt{{.*}}(i3 noundef % +// CHECK-INTEL: [[DEF]] void @{{.*}}pass_large_BitInt{{.*}}(i64 noundef %{{.*}}, i64 noundef % +// CHECK-AARCH: [[DEF]] void @{{.*}}pass_large_BitInt{{.*}}(i127 noundef % } // namespace check_exotic diff --git a/clang/test/CodeGen/bitfield-access-pad.c b/clang/test/CodeGen/bitfield-access-pad.c index 8608c5bd8be116..edda7b7798d057 100644 --- a/clang/test/CodeGen/bitfield-access-pad.c +++ b/clang/test/CodeGen/bitfield-access-pad.c @@ -16,6 +16,7 @@ // Configs that have expensive unaligned access // Little Endian // RUN: %clang_cc1 -triple=hexagon-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT-T %s +// RUN: %clang_cc1 -triple=le64-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT-T %s // Big endian // RUN: %clang_cc1 -triple=m68k-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT-T %s diff --git a/clang/test/CodeGen/bitfield-access-unit.c b/clang/test/CodeGen/bitfield-access-unit.c index c1b0a43cccc885..d0553c5183eeff 100644 --- a/clang/test/CodeGen/bitfield-access-unit.c +++ b/clang/test/CodeGen/bitfield-access-unit.c @@ -53,8 +53,8 @@ // RUN: %clang_cc1 -triple=sparc-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT %s // RUN: %clang_cc1 -triple=tce-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT %s -// m68-elf is a strict alignment ISA with 4-byte aligned 64-bit or 2-byte -// aligned 32-bit integer types. This more compex to describe here. +// Both le64-elf and m68-elf are strict alignment ISAs with 4-byte aligned +// 64-bit or 2-byte aligned 32-bit integer types. This more compex to describe here. // If unaligned access is expensive don't stick these together. struct A { diff --git a/clang/test/CodeGen/builtins-bitint.c b/clang/test/CodeGen/builtins-bitint.c index 804e4971287737..207ff388a28764 100644 --- a/clang/test/CodeGen/builtins-bitint.c +++ b/clang/test/CodeGen/builtins-bitint.c @@ -8,10 +8,11 @@ // CHECK-O0-LABEL: define dso_local arm_aapcscc i32 @test_popcountg_ubi1( // CHECK-O0-SAME: ) #[[ATTR0:[0-9]+]] { // CHECK-O0-NEXT: entry: -// CHECK-O0-NEXT: [[A:%.*]] = alloca i1, align 1 -// CHECK-O0-NEXT: store i1 true, ptr [[A]], align 1 -// CHECK-O0-NEXT: [[TMP0:%.*]] = load i1, ptr [[A]], align 1 -// CHECK-O0-NEXT: [[TMP1:%.*]] = call i1 @llvm.ctpop.i1(i1 [[TMP0]]) +// CHECK-O0-NEXT: [[A:%.*]] = alloca i8, align 1 +// CHECK-O0-NEXT: store i8 1, ptr [[A]], align 1 +// CHECK-O0-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-O0-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK-O0-NEXT: [[TMP1:%.*]] = call i1 @llvm.ctpop.i1(i1 [[LOADEDV]]) // CHECK-O0-NEXT: [[CAST:%.*]] = zext i1 [[TMP1]] to i32 // CHECK-O0-NEXT: ret i32 [[CAST]] // @@ -28,10 +29,11 @@ int test_popcountg_ubi1() { // CHECK-O0-LABEL: define dso_local arm_aapcscc i32 @test_popcountg_ubi2( // CHECK-O0-SAME: ) #[[ATTR0]] { // CHECK-O0-NEXT: entry: -// CHECK-O0-NEXT: [[A:%.*]] = alloca i2, align 1 -// CHECK-O0-NEXT: store i2 -1, ptr [[A]], align 1 -// CHECK-O0-NEXT: [[TMP0:%.*]] = load i2, ptr [[A]], align 1 -// CHECK-O0-NEXT: [[TMP1:%.*]] = call i2 @llvm.ctpop.i2(i2 [[TMP0]]) +// CHECK-O0-NEXT: [[A:%.*]] = alloca i8, align 1 +// CHECK-O0-NEXT: store i8 3, ptr [[A]], align 1 +// CHECK-O0-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-O0-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i2 +// CHECK-O0-NEXT: [[TMP1:%.*]] = call i2 @llvm.ctpop.i2(i2 [[LOADEDV]]) // CHECK-O0-NEXT: [[CAST:%.*]] = zext i2 [[TMP1]] to i32 // CHECK-O0-NEXT: ret i32 [[CAST]] // @@ -48,10 +50,11 @@ int test_popcountg_ubi2() { // CHECK-O0-LABEL: define dso_local arm_aapcscc i32 @test_ctzg_ubi1( // CHECK-O0-SAME: ) #[[ATTR0]] { // CHECK-O0-NEXT: entry: -// CHECK-O0-NEXT: [[A:%.*]] = alloca i1, align 1 -// CHECK-O0-NEXT: store i1 false, ptr [[A]], align 1 -// CHECK-O0-NEXT: [[TMP0:%.*]] = load i1, ptr [[A]], align 1 -// CHECK-O0-NEXT: [[TMP1:%.*]] = call i1 @llvm.cttz.i1(i1 [[TMP0]], i1 false) +// CHECK-O0-NEXT: [[A:%.*]] = alloca i8, align 1 +// CHECK-O0-NEXT: store i8 0, ptr [[A]], align 1 +// CHECK-O0-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-O0-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK-O0-NEXT: [[TMP1:%.*]] = call i1 @llvm.cttz.i1(i1 [[LOADEDV]], i1 false) // CHECK-O0-NEXT: [[CAST:%.*]] = zext i1 [[TMP1]] to i32 // CHECK-O0-NEXT: ret i32 [[CAST]] // @@ -68,10 +71,11 @@ int test_ctzg_ubi1() { // CHECK-O0-LABEL: define dso_local arm_aapcscc i32 @test_ctzg_ubi2( // CHECK-O0-SAME: ) #[[ATTR0]] { // CHECK-O0-NEXT: entry: -// CHECK-O0-NEXT: [[A:%.*]] = alloca i2, align 1 -// CHECK-O0-NEXT: store i2 0, ptr [[A]], align 1 -// CHECK-O0-NEXT: [[TMP0:%.*]] = load i2, ptr [[A]], align 1 -// CHECK-O0-NEXT: [[TMP1:%.*]] = call i2 @llvm.cttz.i2(i2 [[TMP0]], i1 false) +// CHECK-O0-NEXT: [[A:%.*]] = alloca i8, align 1 +// CHECK-O0-NEXT: store i8 0, ptr [[A]], align 1 +// CHECK-O0-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-O0-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i2 +// CHECK-O0-NEXT: [[TMP1:%.*]] = call i2 @llvm.cttz.i2(i2 [[LOADEDV]], i1 false) // CHECK-O0-NEXT: [[CAST:%.*]] = zext i2 [[TMP1]] to i32 // CHECK-O0-NEXT: ret i32 [[CAST]] // @@ -88,10 +92,11 @@ int test_ctzg_ubi2() { // CHECK-O0-LABEL: define dso_local arm_aapcscc i32 @test_clzg_ubi1( // CHECK-O0-SAME: ) #[[ATTR0]] { // CHECK-O0-NEXT: entry: -// CHECK-O0-NEXT: [[A:%.*]] = alloca i1, align 1 -// CHECK-O0-NEXT: store i1 false, ptr [[A]], align 1 -// CHECK-O0-NEXT: [[TMP0:%.*]] = load i1, ptr [[A]], align 1 -// CHECK-O0-NEXT: [[TMP1:%.*]] = call i1 @llvm.ctlz.i1(i1 [[TMP0]], i1 false) +// CHECK-O0-NEXT: [[A:%.*]] = alloca i8, align 1 +// CHECK-O0-NEXT: store i8 0, ptr [[A]], align 1 +// CHECK-O0-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-O0-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK-O0-NEXT: [[TMP1:%.*]] = call i1 @llvm.ctlz.i1(i1 [[LOADEDV]], i1 false) // CHECK-O0-NEXT: [[CAST:%.*]] = zext i1 [[TMP1]] to i32 // CHECK-O0-NEXT: ret i32 [[CAST]] // @@ -108,10 +113,11 @@ int test_clzg_ubi1() { // CHECK-O0-LABEL: define dso_local arm_aapcscc i32 @test_clzg_ubi2( // CHECK-O0-SAME: ) #[[ATTR0]] { // CHECK-O0-NEXT: entry: -// CHECK-O0-NEXT: [[A:%.*]] = alloca i2, align 1 -// CHECK-O0-NEXT: store i2 0, ptr [[A]], align 1 -// CHECK-O0-NEXT: [[TMP0:%.*]] = load i2, ptr [[A]], align 1 -// CHECK-O0-NEXT: [[TMP1:%.*]] = call i2 @llvm.ctlz.i2(i2 [[TMP0]], i1 false) +// CHECK-O0-NEXT: [[A:%.*]] = alloca i8, align 1 +// CHECK-O0-NEXT: store i8 0, ptr [[A]], align 1 +// CHECK-O0-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-O0-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i2 +// CHECK-O0-NEXT: [[TMP1:%.*]] = call i2 @llvm.ctlz.i2(i2 [[LOADEDV]], i1 false) // CHECK-O0-NEXT: [[CAST:%.*]] = zext i2 [[TMP1]] to i32 // CHECK-O0-NEXT: ret i32 [[CAST]] // diff --git a/clang/test/CodeGen/builtins-elementwise-math.c b/clang/test/CodeGen/builtins-elementwise-math.c index b52a11cca1990d..8fb52992c0fe68 100644 --- a/clang/test/CodeGen/builtins-elementwise-math.c +++ b/clang/test/CodeGen/builtins-elementwise-math.c @@ -44,8 +44,9 @@ void test_builtin_elementwise_abs(float f1, float f2, double d1, double d2, const si8 cvi2 = vi2; vi2 = __builtin_elementwise_abs(cvi2); - // CHECK: [[BI1:%.+]] = load i31, ptr %bi1.addr, align 4 - // CHECK-NEXT: call i31 @llvm.abs.i31(i31 [[BI1]], i1 false) + // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4 + // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31 + // CHECK-NEXT: call i31 @llvm.abs.i31(i31 [[LOADEDV]], i1 false) bi2 = __builtin_elementwise_abs(bi1); // CHECK: [[IA1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4 @@ -92,14 +93,18 @@ void test_builtin_elementwise_add_sat(float f1, float f2, double d1, double d2, // CHECK-NEXT: call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]]) vu1 = __builtin_elementwise_add_sat(vu1, vu2); - // CHECK: [[BI1:%.+]] = load i31, ptr %bi1.addr, align 4 - // CHECK-NEXT: [[BI2:%.+]] = load i31, ptr %bi2.addr, align 4 - // CHECK-NEXT: call i31 @llvm.sadd.sat.i31(i31 [[BI1]], i31 [[BI2]]) + // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4 + // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31 + // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4 + // CHECK-NEXT: [[LOADEDV1:%.+]] = trunc i32 [[BI2]] to i31 + // CHECK-NEXT: call i31 @llvm.sadd.sat.i31(i31 [[LOADEDV]], i31 [[LOADEDV1]]) bi1 = __builtin_elementwise_add_sat(bi1, bi2); - // CHECK: [[BU1:%.+]] = load i55, ptr %bu1.addr, align 8 - // CHECK-NEXT: [[BU2:%.+]] = load i55, ptr %bu2.addr, align 8 - // CHECK-NEXT: call i55 @llvm.uadd.sat.i55(i55 [[BU1]], i55 [[BU2]]) + // CHECK: [[BU1:%.+]] = load i64, ptr %bu1.addr, align 8 + // CHECK-NEXT: [[LOADEDV2:%.+]] = trunc i64 [[BU1]] to i55 + // CHECK-NEXT: [[BU2:%.+]] = load i64, ptr %bu2.addr, align 8 + // CHECK-NEXT: [[LOADEDV3:%.+]] = trunc i64 [[BU2]] to i55 + // CHECK-NEXT: call i55 @llvm.uadd.sat.i55(i55 [[LOADEDV2]], i55 [[LOADEDV3]]) bu1 = __builtin_elementwise_add_sat(bu1, bu2); // CHECK: [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4 @@ -141,14 +146,18 @@ void test_builtin_elementwise_sub_sat(float f1, float f2, double d1, double d2, // CHECK-NEXT: call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]]) vu1 = __builtin_elementwise_sub_sat(vu1, vu2); - // CHECK: [[BI1:%.+]] = load i31, ptr %bi1.addr, align 4 - // CHECK-NEXT: [[BI2:%.+]] = load i31, ptr %bi2.addr, align 4 - // CHECK-NEXT: call i31 @llvm.ssub.sat.i31(i31 [[BI1]], i31 [[BI2]]) + // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4 + // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31 + // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4 + // CHECK-NEXT: [[LOADEDV1:%.+]] = trunc i32 [[BI2]] to i31 + // CHECK-NEXT: call i31 @llvm.ssub.sat.i31(i31 [[LOADEDV]], i31 [[LOADEDV1]]) bi1 = __builtin_elementwise_sub_sat(bi1, bi2); - // CHECK: [[BU1:%.+]] = load i55, ptr %bu1.addr, align 8 - // CHECK-NEXT: [[BU2:%.+]] = load i55, ptr %bu2.addr, align 8 - // CHECK-NEXT: call i55 @llvm.usub.sat.i55(i55 [[BU1]], i55 [[BU2]]) + // CHECK: [[BU1:%.+]] = load i64, ptr %bu1.addr, align 8 + // CHECK-NEXT: [[LOADEDV2:%.+]] = trunc i64 [[BU1]] to i55 + // CHECK-NEXT: [[BU2:%.+]] = load i64, ptr %bu2.addr, align 8 + // CHECK-NEXT: [[LOADEDV3:%.+]] = trunc i64 [[BU2]] to i55 + // CHECK-NEXT: call i55 @llvm.usub.sat.i55(i55 [[LOADEDV2]], i55 [[LOADEDV3]]) bu1 = __builtin_elementwise_sub_sat(bu1, bu2); // CHECK: [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4 @@ -169,7 +178,7 @@ void test_builtin_elementwise_max(float f1, float f2, double d1, double d2, // CHECK-LABEL: define void @test_builtin_elementwise_max( // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4 // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4 - // CHECK-NEXT: call float @llvm.maxnum.f32(float %0, float %1) + // CHECK-NEXT: call float @llvm.maxnum.f32(float [[F1]], float [[F2]]) f1 = __builtin_elementwise_max(f1, f2); // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8 @@ -210,14 +219,18 @@ void test_builtin_elementwise_max(float f1, float f2, double d1, double d2, // CHECK-NEXT: call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]]) vu1 = __builtin_elementwise_max(vu1, vu2); - // CHECK: [[BI1:%.+]] = load i31, ptr %bi1.addr, align 4 - // CHECK-NEXT: [[BI2:%.+]] = load i31, ptr %bi2.addr, align 4 - // CHECK-NEXT: call i31 @llvm.smax.i31(i31 [[BI1]], i31 [[BI2]]) + // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4 + // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31 + // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4 + // CHECK-NEXT: [[LOADEDV1:%.+]] = trunc i32 [[BI2]] to i31 + // CHECK-NEXT: call i31 @llvm.smax.i31(i31 [[LOADEDV]], i31 [[LOADEDV1]]) bi1 = __builtin_elementwise_max(bi1, bi2); - // CHECK: [[BU1:%.+]] = load i55, ptr %bu1.addr, align 8 - // CHECK-NEXT: [[BU2:%.+]] = load i55, ptr %bu2.addr, align 8 - // CHECK-NEXT: call i55 @llvm.umax.i55(i55 [[BU1]], i55 [[BU2]]) + // CHECK: [[BU1:%.+]] = load i64, ptr %bu1.addr, align 8 + // CHECK-NEXT: [[LOADEDV2:%.+]] = trunc i64 [[BU1]] to i55 + // CHECK-NEXT: [[BU2:%.+]] = load i64, ptr %bu2.addr, align 8 + // CHECK-NEXT: [[LOADEDV3:%.+]] = trunc i64 [[BU2]] to i55 + // CHECK-NEXT: call i55 @llvm.umax.i55(i55 [[LOADEDV2]], i55 [[LOADEDV3]]) bu1 = __builtin_elementwise_max(bu1, bu2); // CHECK: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16 @@ -249,7 +262,7 @@ void test_builtin_elementwise_min(float f1, float f2, double d1, double d2, // CHECK-LABEL: define void @test_builtin_elementwise_min( // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4 // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4 - // CHECK-NEXT: call float @llvm.minnum.f32(float %0, float %1) + // CHECK-NEXT: call float @llvm.minnum.f32(float [[F1]], float [[F2]]) f1 = __builtin_elementwise_min(f1, f2); // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8 @@ -296,14 +309,18 @@ void test_builtin_elementwise_min(float f1, float f2, double d1, double d2, // CHECK-NEXT: call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]]) vu1 = __builtin_elementwise_min(vu1, vu2); - // CHECK: [[BI1:%.+]] = load i31, ptr %bi1.addr, align 4 - // CHECK-NEXT: [[BI2:%.+]] = load i31, ptr %bi2.addr, align 4 - // CHECK-NEXT: call i31 @llvm.smin.i31(i31 [[BI1]], i31 [[BI2]]) + // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4 + // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31 + // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4 + // CHECK-NEXT: [[LOADEDV1:%.+]] = trunc i32 [[BI2]] to i31 + // CHECK-NEXT: call i31 @llvm.smin.i31(i31 [[LOADEDV]], i31 [[LOADEDV1]]) bi1 = __builtin_elementwise_min(bi1, bi2); - // CHECK: [[BU1:%.+]] = load i55, ptr %bu1.addr, align 8 - // CHECK-NEXT: [[BU2:%.+]] = load i55, ptr %bu2.addr, align 8 - // CHECK-NEXT: call i55 @llvm.umin.i55(i55 [[BU1]], i55 [[BU2]]) + // CHECK: [[BU1:%.+]] = load i64, ptr %bu1.addr, align 8 + // CHECK-NEXT: [[LOADEDV2:%.+]] = trunc i64 [[BU1]] to i55 + // CHECK-NEXT: [[BU2:%.+]] = load i64, ptr %bu2.addr, align 8 + // CHECK-NEXT: [[LOADEDV3:%.+]] = trunc i64 [[BU2]] to i55 + // CHECK-NEXT: call i55 @llvm.umin.i55(i55 [[LOADEDV2]], i55 [[LOADEDV3]]) bu1 = __builtin_elementwise_min(bu1, bu2); // CHECK: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16 @@ -341,8 +358,9 @@ void test_builtin_elementwise_bitreverse(si8 vi1, si8 vi2, const si8 cvi2 = vi2; vi2 = __builtin_elementwise_bitreverse(cvi2); - // CHECK: [[BI1:%.+]] = load i31, ptr %bi1.addr, align 4 - // CHECK-NEXT: call i31 @llvm.bitreverse.i31(i31 [[BI1]]) + // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4 + // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31 + // CHECK-NEXT: call i31 @llvm.bitreverse.i31(i31 [[LOADEDV]]) bi2 = __builtin_elementwise_bitreverse(bi1); // CHECK: [[IA1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4 diff --git a/clang/test/CodeGen/builtins-overflow.c b/clang/test/CodeGen/builtins-overflow.c index 4babc05759dc8a..7c524723f76e8e 100644 --- a/clang/test/CodeGen/builtins-overflow.c +++ b/clang/test/CodeGen/builtins-overflow.c @@ -42,11 +42,13 @@ int test_add_overflow_int_int_int(int x, int y) { int test_add_overflow_xint31_xint31_xint31(_BitInt(31) x, _BitInt(31) y) { // CHECK-LABEL: define {{(dso_local )?}}i32 @test_add_overflow_xint31_xint31_xint31({{.+}}) + // CHECK: %loadedv = trunc i32 %{{.*}} to i31 // CHECK-NOT: ext // CHECK: [[S:%.+]] = call { i31, i1 } @llvm.sadd.with.overflow.i31(i31 %{{.+}}, i31 %{{.+}}) // CHECK-DAG: [[C:%.+]] = extractvalue { i31, i1 } [[S]], 1 // CHECK-DAG: [[Q:%.+]] = extractvalue { i31, i1 } [[S]], 0 - // CHECK: store i31 [[Q]], ptr + // CHECK: [[STOREDV:%.+]] = sext i31 [[Q]] to i32 + // CHECK: store i32 [[STOREDV]], ptr // CHECK: br i1 [[C]] _BitInt(31) r; if (__builtin_add_overflow(x, y, &r)) @@ -84,11 +86,13 @@ int test_sub_overflow_int_int_int(int x, int y) { int test_sub_overflow_xint31_xint31_xint31(_BitInt(31) x, _BitInt(31) y) { // CHECK-LABEL: define {{(dso_local )?}}i32 @test_sub_overflow_xint31_xint31_xint31({{.+}}) + // CHECK: %loadedv = trunc i32 %{{.*}} to i31 // CHECK-NOT: ext // CHECK: [[S:%.+]] = call { i31, i1 } @llvm.ssub.with.overflow.i31(i31 %{{.+}}, i31 %{{.+}}) // CHECK-DAG: [[C:%.+]] = extractvalue { i31, i1 } [[S]], 1 // CHECK-DAG: [[Q:%.+]] = extractvalue { i31, i1 } [[S]], 0 - // CHECK: store i31 [[Q]], ptr + // CHECK: [[STOREDV:%.+]] = sext i31 [[Q]] to i32 + // CHECK: store i32 [[STOREDV]], ptr // CHECK: br i1 [[C]] _BitInt(31) r; if (__builtin_sub_overflow(x, y, &r)) @@ -171,11 +175,13 @@ int test_mul_overflow_int_int_int(int x, int y) { int test_mul_overflow_xint31_xint31_xint31(_BitInt(31) x, _BitInt(31) y) { // CHECK-LABEL: define {{(dso_local )?}}i32 @test_mul_overflow_xint31_xint31_xint31({{.+}}) + // CHECK: %loadedv = trunc i32 %{{.*}} to i31 // CHECK-NOT: ext // CHECK: [[S:%.+]] = call { i31, i1 } @llvm.smul.with.overflow.i31(i31 %{{.+}}, i31 %{{.+}}) // CHECK-DAG: [[C:%.+]] = extractvalue { i31, i1 } [[S]], 1 // CHECK-DAG: [[Q:%.+]] = extractvalue { i31, i1 } [[S]], 0 - // CHECK: store i31 [[Q]], ptr + // CHECK: [[STOREDV:%.+]] = sext i31 [[Q]] to i32 + // CHECK: store i32 [[STOREDV]], ptr // CHECK: br i1 [[C]] _BitInt(31) r; if (__builtin_mul_overflow(x, y, &r)) @@ -185,11 +191,13 @@ int test_mul_overflow_xint31_xint31_xint31(_BitInt(31) x, _BitInt(31) y) { int test_mul_overflow_xint127_xint127_xint127(_BitInt(127) x, _BitInt(127) y) { // CHECK-LABEL: define {{(dso_local )?}}i32 @test_mul_overflow_xint127_xint127_xint127({{.+}}) + // CHECK: %loadedv = trunc i128 %{{.*}} to i127 // CHECK-NOT: ext // CHECK: [[S:%.+]] = call { i127, i1 } @llvm.smul.with.overflow.i127(i127 %{{.+}}, i127 %{{.+}}) // CHECK-DAG: [[C:%.+]] = extractvalue { i127, i1 } [[S]], 1 // CHECK-DAG: [[Q:%.+]] = extractvalue { i127, i1 } [[S]], 0 - // CHECK: store i127 [[Q]], ptr + // CHECK: [[STOREDV:%.+]] = sext i127 [[Q]] to i128 + // CHECK: store i128 [[STOREDV]], ptr // CHECK: br i1 [[C]] _BitInt(127) r; if (__builtin_mul_overflow(x, y, &r)) diff --git a/clang/test/CodeGen/builtins-x86.c b/clang/test/CodeGen/builtins-x86.c index e0f220dbeafccd..de31a4db5b0c18 100644 --- a/clang/test/CodeGen/builtins-x86.c +++ b/clang/test/CodeGen/builtins-x86.c @@ -3,7 +3,6 @@ // RUN: %clang_cc1 -DUSE_64 -DOPENCL -x cl -cl-std=CL2.0 -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -target-feature +mwaitx -target-feature +clzero -target-feature +shstk -target-feature +wbnoinvd -target-feature +cldemote -emit-llvm -o %t %s #ifdef USE_ALL -#define USE_3DNOW #define USE_64 #define USE_SSE4 #endif @@ -96,9 +95,6 @@ void f0(void) { V4s tmp_V4s; V2i tmp_V2i; V1LLi tmp_V1LLi; -#ifdef USE_3DNOW - V2f tmp_V2f; -#endif // 128-bit V16c tmp_V16c; @@ -513,33 +509,7 @@ void f0(void) { __builtin_ia32_maskstorepd256(tmp_V4dp, tmp_V4LLi, tmp_V4d); __builtin_ia32_maskstoreps256(tmp_V8fp, tmp_V8i, tmp_V8f); -#ifdef USE_3DNOW - tmp_V8c = __builtin_ia32_pavgusb(tmp_V8c, tmp_V8c); - tmp_V2i = __builtin_ia32_pf2id(tmp_V2f); - tmp_V2f = __builtin_ia32_pfacc(tmp_V2f, tmp_V2f); - tmp_V2f = __builtin_ia32_pfadd(tmp_V2f, tmp_V2f); - tmp_V2i = __builtin_ia32_pfcmpeq(tmp_V2f, tmp_V2f); - tmp_V2i = __builtin_ia32_pfcmpge(tmp_V2f, tmp_V2f); - tmp_V2i = __builtin_ia32_pfcmpgt(tmp_V2f, tmp_V2f); - tmp_V2f = __builtin_ia32_pfmax(tmp_V2f, tmp_V2f); - tmp_V2f = __builtin_ia32_pfmin(tmp_V2f, tmp_V2f); - tmp_V2f = __builtin_ia32_pfmul(tmp_V2f, tmp_V2f); - tmp_V2f = __builtin_ia32_pfrcp(tmp_V2f); - tmp_V2f = __builtin_ia32_pfrcpit1(tmp_V2f, tmp_V2f); - tmp_V2f = __builtin_ia32_pfrcpit2(tmp_V2f, tmp_V2f); - tmp_V2f = __builtin_ia32_pfrsqrt(tmp_V2f); - tmp_V2f = __builtin_ia32_pfrsqit1(tmp_V2f, tmp_V2f); - tmp_V2f = __builtin_ia32_pfsub(tmp_V2f, tmp_V2f); - tmp_V2f = __builtin_ia32_pfsubr(tmp_V2f, tmp_V2f); - tmp_V2f = __builtin_ia32_pi2fd(tmp_V2i); - tmp_V4s = __builtin_ia32_pmulhrw(tmp_V4s, tmp_V4s); - tmp_V2i = __builtin_ia32_pf2iw(tmp_V2f); - tmp_V2f = __builtin_ia32_pfnacc(tmp_V2f, tmp_V2f); - tmp_V2f = __builtin_ia32_pfpnacc(tmp_V2f, tmp_V2f); - tmp_V2f = __builtin_ia32_pi2fw(tmp_V2i); - tmp_V2f = __builtin_ia32_pswapdsf(tmp_V2f); - tmp_V2i = __builtin_ia32_pswapdsi(tmp_V2i); - +#if USE_ALL tmp_V4i = __builtin_ia32_sha1rnds4(tmp_V4i, tmp_V4i, imm_i_0_4); tmp_V4i = __builtin_ia32_sha1nexte(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha1msg1(tmp_V4i, tmp_V4i); diff --git a/clang/test/CodeGen/constrained-math-builtins.c b/clang/test/CodeGen/constrained-math-builtins.c index 6cc3a10a1e7946..42c9e3c5008a39 100644 --- a/clang/test/CodeGen/constrained-math-builtins.c +++ b/clang/test/CodeGen/constrained-math-builtins.c @@ -78,12 +78,13 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c, _ // CHECK: call x86_fp80 @llvm.experimental.constrained.floor.f80(x86_fp80 %{{.*}}, metadata !"fpexcept.strict") // CHECK: call fp128 @llvm.experimental.constrained.floor.f128(fp128 %{{.*}}, metadata !"fpexcept.strict") - __builtin_fma(f,f,f); __builtin_fmaf(f,f,f); __builtin_fmal(f,f,f); __builtin_fmaf128(f,f,f); + __builtin_fma(f,f,f); __builtin_fmaf(f,f,f); __builtin_fmal(f,f,f); __builtin_fmaf128(f,f,f); __builtin_fmaf16(f,f,f); // CHECK: call double @llvm.experimental.constrained.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") // CHECK: call float @llvm.experimental.constrained.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") // CHECK: call x86_fp80 @llvm.experimental.constrained.fma.f80(x86_fp80 %{{.*}}, x86_fp80 %{{.*}}, x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") // CHECK: call fp128 @llvm.experimental.constrained.fma.f128(fp128 %{{.*}}, fp128 %{{.*}}, fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") +// CHECK: call half @llvm.experimental.constrained.fma.f16(half %{{.*}}, half %{{.*}}, half %{{.*}}, metadata !"fpexcept.strict") __builtin_fmax(f,f); __builtin_fmaxf(f,f); __builtin_fmaxl(f,f); __builtin_fmaxf128(f,f); diff --git a/clang/test/CodeGen/ext-int-cc.c b/clang/test/CodeGen/ext-int-cc.c index 508728172ab4a3..05b2bf1bec81ee 100644 --- a/clang/test/CodeGen/ext-int-cc.c +++ b/clang/test/CodeGen/ext-int-cc.c @@ -67,29 +67,29 @@ void ParamPassing2(_BitInt(127) b, _BitInt(63) c) {} // WIN64: define dso_local void @ParamPassing2(ptr %{{.+}}, i63 %{{.+}}) // LIN32: define{{.*}} void @ParamPassing2(ptr %{{.+}}, i63 %{{.+}}) // WIN32: define dso_local void @ParamPassing2(ptr %{{.+}}, i63 %{{.+}}) -// NACL: define{{.*}} void @ParamPassing2(ptr byval(i127) align 8 %{{.+}}, i63 %{{.+}}) +// NACL: define{{.*}} void @ParamPassing2(ptr byval(i128) align 8 %{{.+}}, i63 %{{.+}}) // NVPTX64: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 %{{.+}}) -// NVPTX: define{{.*}} void @ParamPassing2(ptr byval(i127) align 8 %{{.+}}, i63 %{{.+}}) +// NVPTX: define{{.*}} void @ParamPassing2(ptr byval(i128) align 8 %{{.+}}, i63 %{{.+}}) // SPARCV9: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 signext %{{.+}}) -// SPARC: define{{.*}} void @ParamPassing2(ptr byval(i127) align 8 %{{.+}}, i63 %{{.+}}) +// SPARC: define{{.*}} void @ParamPassing2(ptr byval(i128) align 8 %{{.+}}, i63 %{{.+}}) // MIPS64: define{{.*}} void @ParamPassing2(i127 signext %{{.+}}, i63 signext %{{.+}}) -// MIPS: define{{.*}} void @ParamPassing2(ptr byval(i127) align 8 %{{.+}}, i63 signext %{{.+}}) -// SPIR64: define{{.*}} spir_func void @ParamPassing2(ptr byval(i127) align 8 %{{.+}}, i63 %{{.+}}) -// SPIR: define{{.*}} spir_func void @ParamPassing2(ptr byval(i127) align 8 %{{.+}}, i63 %{{.+}}) -// HEX: define{{.*}} void @ParamPassing2(ptr byval(i127) align 8 %{{.+}}, i63 %{{.+}}) -// LANAI: define{{.*}} void @ParamPassing2(ptr byval(i127) align 4 %{{.+}}, i63 %{{.+}}) -// R600: define{{.*}} void @ParamPassing2(ptr addrspace(5) byval(i127) align 8 %{{.+}}, i63 %{{.+}}) -// ARC: define{{.*}} void @ParamPassing2(ptr byval(i127) align 4 %{{.+}}, i63 inreg %{{.+}}) -// XCORE: define{{.*}} void @ParamPassing2(ptr byval(i127) align 4 %{{.+}}, i63 %{{.+}}) +// MIPS: define{{.*}} void @ParamPassing2(ptr byval(i128) align 8 %{{.+}}, i63 signext %{{.+}}) +// SPIR64: define{{.*}} spir_func void @ParamPassing2(ptr byval(i128) align 8 %{{.+}}, i63 %{{.+}}) +// SPIR: define{{.*}} spir_func void @ParamPassing2(ptr byval(i128) align 8 %{{.+}}, i63 %{{.+}}) +// HEX: define{{.*}} void @ParamPassing2(ptr byval(i128) align 8 %{{.+}}, i63 %{{.+}}) +// LANAI: define{{.*}} void @ParamPassing2(ptr byval(i128) align 4 %{{.+}}, i63 %{{.+}}) +// R600: define{{.*}} void @ParamPassing2(ptr addrspace(5) byval(i128) align 8 %{{.+}}, i63 %{{.+}}) +// ARC: define{{.*}} void @ParamPassing2(ptr byval(i128) align 4 %{{.+}}, i63 inreg %{{.+}}) +// XCORE: define{{.*}} void @ParamPassing2(ptr byval(i128) align 4 %{{.+}}, i63 %{{.+}}) // RISCV64: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 signext %{{.+}}) // RISCV32: define{{.*}} void @ParamPassing2(ptr %{{.+}}, i63 %{{.+}}) // WASM: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 %{{.+}}) // SYSTEMZ: define{{.*}} void @ParamPassing2(ptr %{{.+}}, i63 signext %{{.+}}) // PPC64: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 signext %{{.+}}) -// PPC32: define{{.*}} void @ParamPassing2(ptr byval(i127) align 8 %{{.+}}, i63 %{{.+}}) +// PPC32: define{{.*}} void @ParamPassing2(ptr byval(i128) align 8 %{{.+}}, i63 %{{.+}}) // AARCH64: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 %{{.+}}) // AARCH64DARWIN: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 %{{.+}}) -// ARM: define{{.*}} arm_aapcscc void @ParamPassing2(ptr byval(i127) align 8 %{{.+}}, i63 %{{.+}}) +// ARM: define{{.*}} arm_aapcscc void @ParamPassing2(ptr byval(i128) align 8 %{{.+}}, i63 %{{.+}}) // LA64: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 signext %{{.+}}) // LA32: define{{.*}} void @ParamPassing2(ptr %{{.+}}, i63 %{{.+}}) @@ -131,7 +131,7 @@ void ParamPassing3(_BitInt(15) a, _BitInt(31) b) {} // are negated. This will give an error when a target does support larger // _BitInt widths to alert us to enable the test. void ParamPassing4(_BitInt(129) a) {} -// LIN64: define{{.*}} void @ParamPassing4(ptr byval(i129) align 8 %{{.+}}) +// LIN64: define{{.*}} void @ParamPassing4(ptr byval([24 x i8]) align 8 %{{.+}}) // WIN64: define dso_local void @ParamPassing4(ptr %{{.+}}) // LIN32: define{{.*}} void @ParamPassing4(ptr %{{.+}}) // WIN32: define dso_local void @ParamPassing4(ptr %{{.+}}) diff --git a/clang/test/CodeGen/ext-int-sanitizer.cpp b/clang/test/CodeGen/ext-int-sanitizer.cpp index 85ae26c72f45f5..f7c6db7236290c 100644 --- a/clang/test/CodeGen/ext-int-sanitizer.cpp +++ b/clang/test/CodeGen/ext-int-sanitizer.cpp @@ -55,12 +55,15 @@ void FloatOverflow(float f, double d) { void UIntTruncation(unsigned _BitInt(35) E, unsigned int i, unsigned long long ll) { i = E; - // CHECK: %[[LOADE:.+]] = load i35 - // CHECK: store i35 %[[LOADE]], ptr %[[EADDR:.+]] - // CHECK: %[[LOADE2:.+]] = load i35, ptr %[[EADDR]] - // CHECK: %[[CONV:.+]] = trunc i35 %[[LOADE2]] to i32 + // CHECK: %[[LOADE:.+]] = load i64 + // CHECK: %[[E1:.+]] = trunc i64 %[[LOADE]] to i35 + // CHECK: %[[STOREDV:.+]] = zext i35 %[[E1]] to i64 + // CHECK: store i64 %[[STOREDV]], ptr %[[EADDR:.+]] + // CHECK: %[[LOADE2:.+]] = load i64, ptr %[[EADDR]] + // CHECK: %[[LOADEDV:.+]] = trunc i64 %[[LOADE2]] to i35 + // CHECK: %[[CONV:.+]] = trunc i35 %[[LOADEDV]] to i32 // CHECK: %[[EXT:.+]] = zext i32 %[[CONV]] to i35 - // CHECK: %[[CHECK:.+]] = icmp eq i35 %[[EXT]], %[[LOADE2]] + // CHECK: %[[CHECK:.+]] = icmp eq i35 %[[EXT]], %[[LOADEDV]] // CHECK: br i1 %[[CHECK]] // CHECK: call void @__ubsan_handle_implicit_conversion_abort @@ -77,43 +80,49 @@ void UIntTruncation(unsigned _BitInt(35) E, unsigned int i, unsigned long long l void IntTruncation(_BitInt(35) E, unsigned _BitInt(42) UE, int i, unsigned j) { j = E; - // CHECK: %[[LOADE:.+]] = load i35 - // CHECK: store i35 %[[LOADE]], ptr %[[EADDR:.+]] - // CHECK: %[[LOADE2:.+]] = load i35, ptr %[[EADDR]] - // CHECK: %[[CONV:.+]] = trunc i35 %[[LOADE2]] to i32 + // CHECK: %[[LOADE:.+]] = load i64 + // CHECK: %[[E1:.+]] = trunc i64 %[[LOADE]] to i35 + // CHECK: %[[STOREDV:.+]] = sext i35 %[[E1]] to i64 + // CHECK: store i64 %[[STOREDV]], ptr %[[EADDR:.+]] + // CHECK: %[[LOADE2:.+]] = load i64, ptr %[[EADDR]] + // CHECK: %[[LOADEDV:.+]] = trunc i64 %[[LOADE2]] to i35 + // CHECK: %[[CONV:.+]] = trunc i35 %[[LOADEDV]] to i32 // CHECK: %[[EXT:.+]] = zext i32 %[[CONV]] to i35 - // CHECK: %[[CHECK:.+]] = icmp eq i35 %[[EXT]], %[[LOADE2]] + // CHECK: %[[CHECK:.+]] = icmp eq i35 %[[EXT]], %[[LOADEDV]] // CHECK: br i1 %[[CHECK]] // CHECK: call void @__ubsan_handle_implicit_conversion_abort j = UE; - // CHECK: %[[LOADUE:.+]] = load i42 - // CHECK: %[[CONV:.+]] = trunc i42 %[[LOADUE]] to i32 + // CHECK: %[[LOADUE:.+]] = load i64 + // CHECK: %[[LOADEDV:.+]] = trunc i64 %[[LOADUE]] to i42 + // CHECK: %[[CONV:.+]] = trunc i42 %[[LOADEDV]] to i32 // CHECK: %[[EXT:.+]] = zext i32 %[[CONV]] to i42 - // CHECK: %[[CHECK:.+]] = icmp eq i42 %[[EXT]], %[[LOADUE]] + // CHECK: %[[CHECK:.+]] = icmp eq i42 %[[EXT]], %[[LOADEDV]] // CHECK: br i1 %[[CHECK]] // CHECK: call void @__ubsan_handle_implicit_conversion_abort // Note: also triggers sign change check. i = UE; - // CHECK: %[[LOADUE:.+]] = load i42 - // CHECK: %[[CONV:.+]] = trunc i42 %[[LOADUE]] to i32 + // CHECK: %[[LOADUE:.+]] = load i64 + // CHECK: %[[LOADEDV:.+]] = trunc i64 %[[LOADUE]] to i42 + // CHECK: %[[CONV:.+]] = trunc i42 %[[LOADEDV]] to i32 // CHECK: %[[NEG:.+]] = icmp slt i32 %[[CONV]], 0 // CHECK: %[[SIGNCHECK:.+]] = icmp eq i1 false, %[[NEG]] // CHECK: %[[EXT:.+]] = sext i32 %[[CONV]] to i42 - // CHECK: %[[CHECK:.+]] = icmp eq i42 %[[EXT]], %[[LOADUE]] + // CHECK: %[[CHECK:.+]] = icmp eq i42 %[[EXT]], %[[LOADEDV]] // CHECK: %[[CHECKBOTH:.+]] = and i1 %[[SIGNCHECK]], %[[CHECK]] // CHECK: br i1 %[[CHECKBOTH]] // CHECK: call void @__ubsan_handle_implicit_conversion_abort // Note: also triggers sign change check. E = UE; - // CHECK: %[[LOADUE:.+]] = load i42 - // CHECK: %[[CONV:.+]] = trunc i42 %[[LOADUE]] to i35 + // CHECK: %[[LOADUE:.+]] = load i64 + // CHECK: %[[LOADEDV:.+]] = trunc i64 %[[LOADUE]] to i42 + // CHECK: %[[CONV:.+]] = trunc i42 %[[LOADEDV]] to i35 // CHECK: %[[NEG:.+]] = icmp slt i35 %[[CONV]], 0 // CHECK: %[[SIGNCHECK:.+]] = icmp eq i1 false, %[[NEG]] // CHECK: %[[EXT:.+]] = sext i35 %[[CONV]] to i42 - // CHECK: %[[CHECK:.+]] = icmp eq i42 %[[EXT]], %[[LOADUE]] + // CHECK: %[[CHECK:.+]] = icmp eq i42 %[[EXT]], %[[LOADEDV]] // CHECK: %[[CHECKBOTH:.+]] = and i1 %[[SIGNCHECK]], %[[CHECK]] // CHECK: br i1 %[[CHECKBOTH]] // CHECK: call void @__ubsan_handle_implicit_conversion_abort @@ -122,19 +131,24 @@ void IntTruncation(_BitInt(35) E, unsigned _BitInt(42) UE, int i, unsigned j) { // CHECK: define{{.*}} void @_Z15SignChangeCheckDU39_DB39_ void SignChangeCheck(unsigned _BitInt(39) UE, _BitInt(39) E) { UE = E; - // CHECK: %[[LOADEU:.+]] = load i39 - // CHECK: %[[LOADE:.+]] = load i39 - // CHECK: store i39 %[[LOADE]], ptr %[[EADDR:.+]] - // CHECK: %[[LOADE2:.+]] = load i39, ptr %[[EADDR]] - // CHECK: %[[NEG:.+]] = icmp slt i39 %[[LOADE2]], 0 + // CHECK: %[[LOADEU:.+]] = load i64 + // CHECK: %[[LOADE:.+]] = load i64 + // CHECK: %[[LOADEDV:.+]] = trunc i64 %[[LOADE]] to i39 + // CHECK: %[[STOREDV:.+]] = sext i39 %[[LOADEDV]] to i64 + // CHECK: store i64 %[[STOREDV]], ptr %[[EADDR:.+]] + // CHECK: %[[LOADE2:.+]] = load i64, ptr %[[EADDR]] + // CHECK: %[[LOADEDV2:.+]] = trunc i64 %[[LOADE2]] to i39 + // CHECK: %[[NEG:.+]] = icmp slt i39 %[[LOADEDV2]], 0 // CHECK: %[[SIGNCHECK:.+]] = icmp eq i1 %[[NEG]], false // CHECK: br i1 %[[SIGNCHECK]] // CHECK: call void @__ubsan_handle_implicit_conversion_abort E = UE; - // CHECK: store i39 %[[LOADE2]], ptr %[[UEADDR:.+]] - // CHECK: %[[LOADUE2:.+]] = load i39, ptr %[[UEADDR]] - // CHECK: %[[NEG:.+]] = icmp slt i39 %[[LOADUE2]], 0 + // CHECK: %[[STOREDV2:.+]] = zext i39 %[[LOADEDV2]] to i64 + // CHECK: store i64 %[[STOREDV2]], ptr %[[UEADDR:.+]] + // CHECK: %[[LOADUE2:.+]] = load i64, ptr %[[UEADDR]] + // CHECK: %[[LOADEDV3:.+]] = trunc i64 %[[LOADUE2]] to i39 + // CHECK: %[[NEG:.+]] = icmp slt i39 %[[LOADEDV3]], 0 // CHECK: %[[SIGNCHECK:.+]] = icmp eq i1 false, %[[NEG]] // CHECK: br i1 %[[SIGNCHECK]] // CHECK: call void @__ubsan_handle_implicit_conversion_abort @@ -145,12 +159,14 @@ void DivByZero(_BitInt(11) E, int i) { // Also triggers signed integer overflow. E / E; - // CHECK: %[[EADDR:.+]] = alloca i11 - // CHECK: %[[E:.+]] = load i11, ptr %[[EADDR]] - // CHECK: %[[E2:.+]] = load i11, ptr %[[EADDR]] - // CHECK: %[[NEZERO:.+]] = icmp ne i11 %[[E2]], 0 - // CHECK: %[[NEMIN:.+]] = icmp ne i11 %[[E]], -1024 - // CHECK: %[[NENEG1:.+]] = icmp ne i11 %[[E2]], -1 + // CHECK: %[[EADDR:.+]] = alloca i16 + // CHECK: %[[E:.+]] = load i16, ptr %[[EADDR]] + // CHECK: %[[LOADEDE:.+]] = trunc i16 %[[E]] to i11 + // CHECK: %[[E2:.+]] = load i16, ptr %[[EADDR]] + // CHECK: %[[LOADEDE2:.+]] = trunc i16 %[[E2]] to i11 + // CHECK: %[[NEZERO:.+]] = icmp ne i11 %[[LOADEDE2]], 0 + // CHECK: %[[NEMIN:.+]] = icmp ne i11 %[[LOADEDE]], -1024 + // CHECK: %[[NENEG1:.+]] = icmp ne i11 %[[LOADEDE2]], -1 // CHECK: %[[OR:.+]] = or i1 %[[NEMIN]], %[[NENEG1]] // CHECK: %[[AND:.+]] = and i1 %[[NEZERO]], %[[OR]] // CHECK: br i1 %[[AND]] @@ -162,20 +178,23 @@ void DivByZero(_BitInt(11) E, int i) { // CHECK: define{{.*}} void @_Z6ShiftsDB9_ void Shifts(_BitInt(9) E) { E >> E; - // CHECK: %[[EADDR:.+]] = alloca i9 - // CHECK: %[[LHSE:.+]] = load i9, ptr %[[EADDR]] - // CHECK: %[[RHSE:.+]] = load i9, ptr %[[EADDR]] - // CHECK: %[[CMP:.+]] = icmp ule i9 %[[RHSE]], 8 + // CHECK: %[[EADDR:.+]] = alloca i16 + // CHECK: %[[LHSE:.+]] = load i16, ptr %[[EADDR]] + // CHECK: %[[RHSE:.+]] = load i16, ptr %[[EADDR]] + // CHECK: %[[LOADED:.+]] = trunc i16 %[[RHSE]] to i9 + // CHECK: %[[CMP:.+]] = icmp ule i9 %[[LOADED]], 8 // CHECK: br i1 %[[CMP]] // CHECK: call void @__ubsan_handle_shift_out_of_bounds_abort E << E; - // CHECK: %[[LHSE:.+]] = load i9, ptr - // CHECK: %[[RHSE:.+]] = load i9, ptr - // CHECK: %[[CMP:.+]] = icmp ule i9 %[[RHSE]], 8 + // CHECK: %[[LHSE:.+]] = load i16, ptr + // CHECK: %[[LOADEDL:.+]] = trunc i16 %[[LHSE]] to i9 + // CHECK: %[[RHSE:.+]] = load i16, ptr + // CHECK: %[[LOADED:.+]] = trunc i16 %[[RHSE]] to i9 + // CHECK: %[[CMP:.+]] = icmp ule i9 %[[LOADED]], 8 // CHECK: br i1 %[[CMP]] - // CHECK: %[[ZEROS:.+]] = sub nuw nsw i9 8, %[[RHSE]] - // CHECK: %[[CHECK:.+]] = lshr i9 %[[LHSE]], %[[ZEROS]] + // CHECK: %[[ZEROS:.+]] = sub nuw nsw i9 8, %[[LOADED]] + // CHECK: %[[CHECK:.+]] = lshr i9 %[[LOADEDL]], %[[ZEROS]] // CHECK: %[[SKIPSIGN:.+]] = lshr i9 %[[CHECK]], 1 // CHECK: %[[CHECK:.+]] = icmp eq i9 %[[SKIPSIGN]] // CHECK: %[[PHI:.+]] = phi i1 [ true, %{{.+}} ], [ %[[CHECK]], %{{.+}} ] @@ -188,11 +207,15 @@ void SignedIntegerOverflow(_BitInt(93) BiggestE, _BitInt(4) SmallestE, _BitInt(31) JustRightE) { BiggestE + BiggestE; - // CHECK: %[[LOADBIGGESTE2:.+]] = load i93 - // CHECK: store i93 %[[LOADBIGGESTE2]], ptr %[[BIGGESTEADDR:.+]] - // CHECK: %[[LOAD1:.+]] = load i93, ptr %[[BIGGESTEADDR]] - // CHECK: %[[LOAD2:.+]] = load i93, ptr %[[BIGGESTEADDR]] - // CHECK: %[[OFCALL:.+]] = call { i93, i1 } @llvm.sadd.with.overflow.i93(i93 %[[LOAD1]], i93 %[[LOAD2]]) + // CHECK: %[[LOADBIGGESTE2:.+]] = load i128 + // CHECK: %[[LOADEDV:.+]] = trunc i128 %[[LOADBIGGESTE2]] to i93 + // CHECK: %[[STOREDV:.+]] = sext i93 %[[LOADEDV]] to i128 + // CHECK: store i128 %[[STOREDV]], ptr %[[BIGGESTEADDR:.+]] + // CHECK: %[[LOAD1:.+]] = load i128, ptr %[[BIGGESTEADDR]] + // CHECK: %[[LOADEDV1:.+]] = trunc i128 %[[LOAD1]] to i93 + // CHECK: %[[LOAD2:.+]] = load i128, ptr %[[BIGGESTEADDR]] + // CHECK: %[[LOADEDV2:.+]] = trunc i128 %[[LOAD2]] to i93 + // CHECK: %[[OFCALL:.+]] = call { i93, i1 } @llvm.sadd.with.overflow.i93(i93 %[[LOADEDV1]], i93 %[[LOADEDV2]]) // CHECK: %[[EXRESULT:.+]] = extractvalue { i93, i1 } %[[OFCALL]], 0 // CHECK: %[[OFRESULT:.+]] = extractvalue { i93, i1 } %[[OFCALL]], 1 // CHECK: %[[CHECK:.+]] = xor i1 %[[OFRESULT]], true @@ -200,9 +223,11 @@ void SignedIntegerOverflow(_BitInt(93) BiggestE, // CHECK: call void @__ubsan_handle_add_overflow_abort SmallestE - SmallestE; - // CHECK: %[[LOAD1:.+]] = load i4, ptr - // CHECK: %[[LOAD2:.+]] = load i4, ptr - // CHECK: %[[OFCALL:.+]] = call { i4, i1 } @llvm.ssub.with.overflow.i4(i4 %[[LOAD1]], i4 %[[LOAD2]]) + // CHECK: %[[LOAD1:.+]] = load i8, ptr + // CHECK: %[[LOADEDV1:.+]] = trunc i8 %[[LOAD1]] to i4 + // CHECK: %[[LOAD2:.+]] = load i8, ptr + // CHECK: %[[LOADEDV2:.+]] = trunc i8 %[[LOAD2]] to i4 + // CHECK: %[[OFCALL:.+]] = call { i4, i1 } @llvm.ssub.with.overflow.i4(i4 %[[LOADEDV1]], i4 %[[LOADEDV2]]) // CHECK: %[[EXRESULT:.+]] = extractvalue { i4, i1 } %[[OFCALL]], 0 // CHECK: %[[OFRESULT:.+]] = extractvalue { i4, i1 } %[[OFCALL]], 1 // CHECK: %[[CHECK:.+]] = xor i1 %[[OFRESULT]], true @@ -210,9 +235,11 @@ void SignedIntegerOverflow(_BitInt(93) BiggestE, // CHECK: call void @__ubsan_handle_sub_overflow_abort JustRightE * JustRightE; - // CHECK: %[[LOAD1:.+]] = load i31, ptr - // CHECK: %[[LOAD2:.+]] = load i31, ptr - // CHECK: %[[OFCALL:.+]] = call { i31, i1 } @llvm.smul.with.overflow.i31(i31 %[[LOAD1]], i31 %[[LOAD2]]) + // CHECK: %[[LOAD1:.+]] = load i32, ptr + // CHECK: %[[LOADEDV1:.+]] = trunc i32 %[[LOAD1]] to i31 + // CHECK: %[[LOAD2:.+]] = load i32, ptr + // CHECK: %[[LOADEDV2:.+]] = trunc i32 %[[LOAD2]] to i31 + // CHECK: %[[OFCALL:.+]] = call { i31, i1 } @llvm.smul.with.overflow.i31(i31 %[[LOADEDV1]], i31 %[[LOADEDV2]]) // CHECK: %[[EXRESULT:.+]] = extractvalue { i31, i1 } %[[OFCALL]], 0 // CHECK: %[[OFRESULT:.+]] = extractvalue { i31, i1 } %[[OFCALL]], 1 // CHECK: %[[CHECK:.+]] = xor i1 %[[OFRESULT]], true @@ -225,10 +252,11 @@ void UnsignedIntegerOverflow(unsigned u, unsigned _BitInt(23) SmallE, unsigned _BitInt(35) BigE) { u = SmallE + SmallE; - // CHECK: %[[BIGGESTEADDR:.+]] = alloca i23 - // CHECK: %[[LOADE1:.+]] = load i23, ptr %[[BIGGESTEADDR]] - // CHECK: %[[LOADE2:.+]] = load i23, ptr %[[BIGGESTEADDR]] - // CHECK: %[[OFCALL:.+]] = call { i23, i1 } @llvm.uadd.with.overflow.i23(i23 %[[LOADE1]], i23 %[[LOADE2]]) + // CHECK: %[[LOADE1:.+]] = load i32, ptr + // CHECK-NEXT: %[[LOADEDV1:.+]] = trunc i32 %[[LOADE1]] to i23 + // CHECK: %[[LOADE2:.+]] = load i32, ptr + // CHECK-NEXT: %[[LOADEDV2:.+]] = trunc i32 %[[LOADE2]] to i23 + // CHECK: %[[OFCALL:.+]] = call { i23, i1 } @llvm.uadd.with.overflow.i23(i23 %[[LOADEDV1]], i23 %[[LOADEDV2]]) // CHECK: %[[EXRESULT:.+]] = extractvalue { i23, i1 } %[[OFCALL]], 0 // CHECK: %[[OFRESULT:.+]] = extractvalue { i23, i1 } %[[OFCALL]], 1 // CHECK: %[[CHECK:.+]] = xor i1 %[[OFRESULT]], true @@ -246,9 +274,11 @@ void UnsignedIntegerOverflow(unsigned u, // CHECK: call void @__ubsan_handle_add_overflow_abort SmallE = SmallE + SmallE; - // CHECK: %[[LOADE1:.+]] = load i23, ptr - // CHECK: %[[LOADE2:.+]] = load i23, ptr - // CHECK: %[[OFCALL:.+]] = call { i23, i1 } @llvm.uadd.with.overflow.i23(i23 %[[LOADE1]], i23 %[[LOADE2]]) + // CHECK: %[[LOADE1:.+]] = load i32, ptr + // CHECK-NEXT: %[[LOADEDV1:.+]] = trunc i32 %[[LOADE1]] to i23 + // CHECK: %[[LOADE2:.+]] = load i32, ptr + // CHECK-NEXT: %[[LOADEDV2:.+]] = trunc i32 %[[LOADE2]] to i23 + // CHECK: %[[OFCALL:.+]] = call { i23, i1 } @llvm.uadd.with.overflow.i23(i23 %[[LOADEDV1]], i23 %[[LOADEDV2]]) // CHECK: %[[EXRESULT:.+]] = extractvalue { i23, i1 } %[[OFCALL]], 0 // CHECK: %[[OFRESULT:.+]] = extractvalue { i23, i1 } %[[OFCALL]], 1 // CHECK: %[[CHECK:.+]] = xor i1 %[[OFRESULT]], true @@ -256,9 +286,11 @@ void UnsignedIntegerOverflow(unsigned u, // CHECK: call void @__ubsan_handle_add_overflow_abort SmallE = BigE + BigE; - // CHECK: %[[LOADE1:.+]] = load i35, ptr - // CHECK: %[[LOADE2:.+]] = load i35, ptr - // CHECK: %[[OFCALL:.+]] = call { i35, i1 } @llvm.uadd.with.overflow.i35(i35 %[[LOADE1]], i35 %[[LOADE2]]) + // CHECK: %[[LOADE1:.+]] = load i64, ptr + // CHECK-NEXT: %[[LOADEDV1:.+]] = trunc i64 %[[LOADE1]] to i35 + // CHECK: %[[LOADE2:.+]] = load i64, ptr + // CHECK-NEXT: %[[LOADEDV2:.+]] = trunc i64 %[[LOADE2]] to i35 + // CHECK: %[[OFCALL:.+]] = call { i35, i1 } @llvm.uadd.with.overflow.i35(i35 %[[LOADEDV1]], i35 %[[LOADEDV2]]) // CHECK: %[[EXRESULT:.+]] = extractvalue { i35, i1 } %[[OFCALL]], 0 // CHECK: %[[OFRESULT:.+]] = extractvalue { i35, i1 } %[[OFCALL]], 1 // CHECK: %[[CHECK:.+]] = xor i1 %[[OFRESULT]], true @@ -266,9 +298,11 @@ void UnsignedIntegerOverflow(unsigned u, // CHECK: call void @__ubsan_handle_add_overflow_abort BigE = BigE + BigE; - // CHECK: %[[LOADE1:.+]] = load i35, ptr - // CHECK: %[[LOADE2:.+]] = load i35, ptr - // CHECK: %[[OFCALL:.+]] = call { i35, i1 } @llvm.uadd.with.overflow.i35(i35 %[[LOADE1]], i35 %[[LOADE2]]) + // CHECK: %[[LOADE1:.+]] = load i64, ptr + // CHECK-NEXT: %[[LOADEDV1:.+]] = trunc i64 %[[LOADE1]] to i35 + // CHECK: %[[LOADE2:.+]] = load i64, ptr + // CHECK-NEXT: %[[LOADEDV2:.+]] = trunc i64 %[[LOADE2]] to i35 + // CHECK: %[[OFCALL:.+]] = call { i35, i1 } @llvm.uadd.with.overflow.i35(i35 %[[LOADEDV1]], i35 %[[LOADEDV2]]) // CHECK: %[[EXRESULT:.+]] = extractvalue { i35, i1 } %[[OFCALL]], 0 // CHECK: %[[OFRESULT:.+]] = extractvalue { i35, i1 } %[[OFCALL]], 1 // CHECK: %[[CHECK:.+]] = xor i1 %[[OFRESULT]], true diff --git a/clang/test/CodeGen/ext-int.c b/clang/test/CodeGen/ext-int.c index 4cb399d108f290..a841daff72e081 100644 --- a/clang/test/CodeGen/ext-int.c +++ b/clang/test/CodeGen/ext-int.c @@ -1,11 +1,24 @@ -// RUN: %clang_cc1 -triple x86_64-gnu-linux -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK64 -// RUN: %clang_cc1 -triple x86_64-windows-pc -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK64 -// RUN: %clang_cc1 -triple i386-gnu-linux -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,LIN32 -// RUN: %clang_cc1 -triple i386-windows-pc -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,WIN32 +// RUN: %clang_cc1 -std=c23 -triple x86_64-gnu-linux -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK64,LIN64 +// RUN: %clang_cc1 -std=c23 -triple x86_64-windows-pc -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK64,WIN64 +// RUN: %clang_cc1 -std=c23 -triple i386-gnu-linux -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,LIN32 +// RUN: %clang_cc1 -std=c23 -triple i386-windows-pc -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,WIN32 + +// CHECK64: %struct.S1 = type { i32, [4 x i8], [24 x i8] } +// WIN32: %struct.S1 = type { i32, [4 x i8], [24 x i8] } +// LIN32: %struct.S1 = type { i32, [20 x i8] } +// CHECK64: %struct.S2 = type { [40 x i8], i32, [4 x i8] } +// WIN32: %struct.S2 = type { [40 x i8], i32, [4 x i8] } +// LIN32: %struct.S2 = type { [36 x i8], i32 } +// LIN64: %struct.S3 = type { [17 x i8], [7 x i8] } +// WIN64: %struct.S3 = type { [24 x i8] } //GH62207 unsigned _BitInt(1) GlobSize1 = 0; -// CHECK: @GlobSize1 = {{.*}}global i1 false +// CHECK: @GlobSize1 = {{.*}}global i8 0 + +// CHECK64: @__const.foo.A = private unnamed_addr constant { i32, [4 x i8], <{ i8, [23 x i8] }> } { i32 1, [4 x i8] undef, <{ i8, [23 x i8] }> <{ i8 -86, [23 x i8] zeroinitializer }> }, align 8 +// @BigGlob = global [40 x i8] c"\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF\FF", align 8 +// CHECK64: @f.p = internal global <{ i8, i8, [22 x i8] }> <{ i8 16, i8 39, [22 x i8] zeroinitializer }>, align 8 void GenericTest(_BitInt(3) a, unsigned _BitInt(3) b, _BitInt(4) c) { // CHECK: define {{.*}}void @GenericTest @@ -43,7 +56,7 @@ void OffsetOfTest(void) { int B = __builtin_offsetof(struct S,B); // CHECK64: store i32 8, ptr %{{.+}} // LIN32: store i32 4, ptr %{{.+}} - // WINCHECK32: store i32 8, ptr %{{.+}} + // WIN32: store i32 8, ptr %{{.+}} int C = __builtin_offsetof(struct S,C); // CHECK64: store i32 24, ptr %{{.+}} // LIN32: store i32 20, ptr %{{.+}} @@ -52,13 +65,149 @@ void OffsetOfTest(void) { void Size1ExtIntParam(unsigned _BitInt(1) A) { // CHECK: define {{.*}}void @Size1ExtIntParam(i1{{.*}} %[[PARAM:.+]]) - // CHECK: %[[PARAM_ADDR:.+]] = alloca i1 - // CHECK: %[[B:.+]] = alloca [5 x i1] - // CHECK: store i1 %[[PARAM]], ptr %[[PARAM_ADDR]] + // CHECK: %[[PARAM_ADDR:.+]] = alloca i8 + // CHECK: %[[B:.+]] = alloca [5 x i8] + // CHECK: %[[STOREDV:.+]] = zext i1 %[[PARAM]] to i8 + // CHECK: store i8 %[[STOREDV]], ptr %[[PARAM_ADDR]] unsigned _BitInt(1) B[5]; - // CHECK: %[[PARAM_LOAD:.+]] = load i1, ptr %[[PARAM_ADDR]] - // CHECK: %[[IDX:.+]] = getelementptr inbounds [5 x i1], ptr %[[B]] - // CHECK: store i1 %[[PARAM_LOAD]], ptr %[[IDX]] + // CHECK: %[[PARAM_LOAD:.+]] = load i8, ptr %[[PARAM_ADDR]] + // CHECK: %[[LOADEDV:.+]] = trunc i8 %0 to i1 + // CHECK: %[[IDX:.+]] = getelementptr inbounds [5 x i8], ptr %[[B]] + // CHECK: %[[STOREDV1:.+]] = zext i1 %[[LOADEDV]] to i8 + // CHECK: store i8 %[[STOREDV1]], ptr %[[IDX]] B[2] = A; } + +#if __BITINT_MAXWIDTH__ > 128 +struct S1 { + _BitInt(17) A; + _BitInt(129) B; +}; + +int foo(int a) { + // CHECK: %A1 = getelementptr inbounds %struct.S1, ptr %B, i32 0, i32 0 + // CHECK: store i32 1, ptr %A1 + // CHECK64: %B2 = getelementptr inbounds %struct.S1, ptr %B, i32 0, i32 2 + // WIN32: %B2 = getelementptr inbounds %struct.S1, ptr %B, i32 0, i32 2 + // LIN32: %B2 = getelementptr inbounds %struct.S1, ptr %B, i32 0, i32 1 + // CHECK: %0 = load i32, ptr %a.addr, align 4 + // CHECK: %conv = sext i32 %0 to i129 + // CHECK64: storedv = sext i129 %conv to i192 + // WIN32: storedv = sext i129 %conv to i192 + // LIN32: storedv = sext i129 %conv to i160 + // CHECK64: store i192 %storedv, ptr %B2, align 8 + // WIN32: store i192 %storedv, ptr %B2, align 8 + // LIN32: store i160 %storedv, ptr %B2, align 4 + // CHECK64: %B3 = getelementptr inbounds %struct.S1, ptr %A, i32 0, i32 2 + // WIN32: %B3 = getelementptr inbounds %struct.S1, ptr %A, i32 0, i32 2 + // LIN32: %B3 = getelementptr inbounds %struct.S1, ptr %A, i32 0, i32 1 + // CHECK64: %1 = load i192, ptr %B3, align 8 + // WIN32: %1 = load i192, ptr %B3, align 8 + // LIN32: %1 = load i160, ptr %B3, align 4 + // CHECK64: %loadedv = trunc i192 %1 to i129 + // WIN32: %loadedv = trunc i192 %1 to i129 + // LIN32: %loadedv = trunc i160 %1 to i129 + // CHECK: %conv4 = trunc i129 %loadedv to i32 + struct S1 A = {1, 170}; + struct S1 B = {1, a}; + return (int)A.B + (int)B.B; +} + +struct S2 { + _BitInt(257) A; + int B; +}; + +_BitInt(257) bar() { + // CHECK64: define {{.*}}void @bar(ptr {{.*}} sret([40 x i8]) align 8 %[[RET:.+]]) + // CHECK64: %A = alloca %struct.S2, align 8 + // CHECK64: %0 = getelementptr inbounds { <{ i8, [39 x i8] }>, i32, [4 x i8] }, ptr %A, i32 0, i32 0 + // CHECK64: %1 = getelementptr inbounds <{ i8, [39 x i8] }>, ptr %0, i32 0, i32 0 + // CHECK64: store i8 1, ptr %1, align 8 + // CHECK64: %2 = getelementptr inbounds { <{ i8, [39 x i8] }>, i32, [4 x i8] }, ptr %A, i32 0, i32 1 + // CHECK64: store i32 10000, ptr %2, align 8 + // CHECK64: %A1 = getelementptr inbounds %struct.S2, ptr %A, i32 0, i32 0 + // CHECK64: %3 = load i320, ptr %A1, align 8 + // CHECK64: %loadedv = trunc i320 %3 to i257 + // CHECK64: %storedv = sext i257 %loadedv to i320 + // CHECK64: store i320 %storedv, ptr %[[RET]], align 8 + struct S2 A = {1, 10000}; + return A.A; +} + +void TakesVarargs(int i, ...) { + // CHECK64: define{{.*}} void @TakesVarargs(i32 +__builtin_va_list args; +__builtin_va_start(args, i); + +_BitInt(160) A = __builtin_va_arg(args, _BitInt(160)); + // CHECK64: %[[ARG:.+]] = load i192 + // CHECK64: %[[TRUNC:.+]] = trunc i192 %[[ARG]] to i160 + // CHECK64: %[[SEXT:.+]] = sext i160 %[[TRUNC]] to i192 + // CHECK64: store i192 %[[SEXT]], ptr %A, align 8 +} + +_BitInt(129) *f1(_BitInt(129) *p) { + // CHECK64: getelementptr inbounds [24 x i8], {{.*}} i64 1 + return p + 1; +} + +char *f2(char *p) { + // CHECK64: getelementptr inbounds i8, {{.*}} i64 24 + return p + sizeof(_BitInt(129)); +} + +auto BigGlob = (_BitInt(257))-1; +// CHECK64: define {{.*}}void @foobar(ptr {{.*}} sret([40 x i8]) align 8 %[[RET1:.+]]) +_BitInt(257) foobar() { + // CHECK64: %A = alloca [40 x i8], align 8 + // CHECK64: %0 = load i320, ptr @BigGlob, align 8 + // CHECK64: %loadedv = trunc i320 %0 to i257 + // CHECK64: %add = add nsw i257 %loadedv, 1 + // CHECK64: %storedv = sext i257 %add to i320 + // CHECK64: store i320 %storedv, ptr %A, align 8 + // CHECK64: %1 = load i320, ptr %A, align 8 + // CHECK64: %loadedv1 = trunc i320 %1 to i257 + // CHECK64: %storedv2 = sext i257 %loadedv1 to i320 + // CHECK64: store i320 %storedv2, ptr %[[RET1]], align 8 + _BitInt(257) A = BigGlob + 1; + return A; +} + +void f() { + static _BitInt(130) p = {10000}; +} + +struct S3 { + _BitInt (136) A : 129; +}; + +void bitField() { + struct S3 s = {1}; + struct { + _BitInt (136) A : 48; + int a; + } s1 = {s.A}; + s1.A = 36; + // LIN64: %s = alloca %struct.S3, align 8 + // LIN64: %s1 = alloca %struct.anon, align 8 + // LIN64: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %s, ptr align 8 @__const.bitField.s, i64 24, i1 false) + // LIN64: %bf.load = load i136, ptr %s, align 8 + // LIN64: %bf.shl = shl i136 %bf.load, 7 + // LIN64: %bf.ashr = ashr i136 %bf.shl, 7 + // LIN64: %0 = trunc i136 %bf.ashr to i64 + // LIN64: %bf.load1 = load i64, ptr %s1, align 8 + // LIN64: %bf.value = and i64 %0, 281474976710655 + // LIN64: %bf.clear = and i64 %bf.load1, -281474976710656 + // LIN64: %bf.set = or i64 %bf.clear, %bf.value + // LIN64: store i64 %bf.set, ptr %s1, align 8 + // LIN64: %a = getelementptr inbounds %struct.anon, ptr %s1, i32 0, i32 1 + // LIN64: store i32 0, ptr %a, align 8 + // LIN64: %bf.load2 = load i64, ptr %s1, align 8 + // LIN64: %bf.clear3 = and i64 %bf.load2, -281474976710656 + // LIN64: %bf.set4 = or i64 %bf.clear3, 36 + // LIN64: store i64 %bf.set4, ptr %s1, align 8 +} + +#endif diff --git a/clang/test/CodeGen/extend-arg-64.c b/clang/test/CodeGen/extend-arg-64.c index 0749523b9ab3d5..2cb56d35af21dc 100644 --- a/clang/test/CodeGen/extend-arg-64.c +++ b/clang/test/CodeGen/extend-arg-64.c @@ -68,7 +68,8 @@ int test(void) { // CHECKEXT-NEXT: call void (i64, ...) @knr knr(ei23); - // CHECKEXT: load i23, ptr @ei23 + // CHECKEXT: load i32, ptr @ei23 + // CHECKEXT: trunc i32 // CHECKEXT-NEXT: call void (i23, ...) @knr knr(ff); diff --git a/clang/test/CodeGen/ifunc.c b/clang/test/CodeGen/ifunc.c index 3aa29f7dff74de..58a00ada687cb0 100644 --- a/clang/test/CodeGen/ifunc.c +++ b/clang/test/CodeGen/ifunc.c @@ -7,11 +7,12 @@ // RUN: %clang_cc1 -triple x86_64-apple-macosx -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple arm64-apple-macosx -O2 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple x86_64-apple-macosx -O2 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple arm64-apple-macosx -fsanitize=thread -O2 -emit-llvm -o - %s | FileCheck %s --check-prefix=MACSAN -// RUN: %clang_cc1 -triple x86_64-apple-macosx -fsanitize=thread -O2 -emit-llvm -o - %s | FileCheck %s --check-prefix=MACSAN -// RUN: %clang_cc1 -triple arm64-apple-macosx -fsanitize=address -O2 -emit-llvm -o - %s | FileCheck %s --check-prefix=MACSAN -// RUN: %clang_cc1 -triple x86_64-apple-macosx -fsanitize=address -O2 -emit-llvm -o - %s | FileCheck %s --check-prefix=MACSAN +// RUN: %clang_cc1 -triple arm64-apple-macosx -fsanitize=thread -O2 -emit-llvm -o - %s | FileCheck %s --check-prefix=SAN +// RUN: %clang_cc1 -triple x86_64-apple-macosx -fsanitize=thread -O2 -emit-llvm -o - %s | FileCheck %s --check-prefix=SAN +// RUN: %clang_cc1 -triple arm64-apple-macosx -fsanitize=address -O2 -emit-llvm -o - %s | FileCheck %s --check-prefix=SAN +// RUN: %clang_cc1 -triple x86_64-apple-macosx -fsanitize=address -O2 -emit-llvm -o - %s | FileCheck %s --check-prefix=SAN +/// The ifunc is emitted before its resolver. int foo(int) __attribute__ ((ifunc("foo_ifunc"))); static int f1(int i) { @@ -45,20 +46,23 @@ extern void goo(void) __attribute__ ((ifunc("goo_ifunc"))); void* goo_ifunc(void) { return 0; } + +/// The ifunc is emitted after its resolver. +void *hoo_ifunc(void) { return 0; } +extern void hoo(int) __attribute__ ((ifunc("hoo_ifunc"))); + // CHECK: @foo = ifunc i32 (i32), ptr @foo_ifunc // CHECK: @goo = ifunc void (), ptr @goo_ifunc +// CHECK: @hoo = ifunc void (i32), ptr @hoo_ifunc // CHECK: call i32 @foo(i32 // CHECK: call void @goo() -// SAN: define internal nonnull {{(noundef )?}}ptr @foo_ifunc() #[[#FOO_IFUNC:]] { -// MACSAN: define internal nonnull {{(noundef )?}}ptr @foo_ifunc() #[[#FOO_IFUNC:]] { +// SAN: define {{(dso_local )?}}noalias {{(noundef )?}}ptr @goo_ifunc() #[[#GOO_IFUNC:]] { -// SAN: define dso_local noalias {{(noundef )?}}ptr @goo_ifunc() #[[#GOO_IFUNC:]] { -// MACSAN: define noalias {{(noundef )?}}ptr @goo_ifunc() #[[#GOO_IFUNC:]] { +// SAN: define {{(dso_local )?}}noalias {{(noundef )?}}ptr @hoo_ifunc() #[[#GOO_IFUNC]] { -// SAN-DAG: attributes #[[#FOO_IFUNC]] = {{{.*}} disable_sanitizer_instrumentation {{.*}} -// MACSAN-DAG: attributes #[[#FOO_IFUNC]] = {{{.*}} disable_sanitizer_instrumentation {{.*}} +// SAN: define internal {{(noundef )?}}nonnull ptr @foo_ifunc() #[[#FOO_IFUNC:]] { // SAN-DAG: attributes #[[#GOO_IFUNC]] = {{{.*}} disable_sanitizer_instrumentation {{.*}} -// MACSAN-DAG: attributes #[[#GOO_IFUNC]] = {{{.*}} disable_sanitizer_instrumentation {{.*}} +// SAN-DAG: attributes #[[#FOO_IFUNC]] = {{{.*}} disable_sanitizer_instrumentation {{.*}} diff --git a/clang/test/CodeGen/kcfi.c b/clang/test/CodeGen/kcfi.c index f6b2e4b398aa7c..622843cedba50f 100644 --- a/clang/test/CodeGen/kcfi.c +++ b/clang/test/CodeGen/kcfi.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fsanitize=kcfi -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fsanitize=kcfi -o - %s | FileCheck %s --check-prefixes=CHECK,C // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fsanitize=kcfi -x c++ -o - %s | FileCheck %s --check-prefixes=CHECK,MEMBER // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fsanitize=kcfi -fpatchable-function-entry-offset=3 -o - %s | FileCheck %s --check-prefixes=CHECK,OFFSET #if !__has_feature(kcfi) @@ -10,6 +10,9 @@ // CHECK: module asm ".set __kcfi_typeid_[[F4]], [[#%d,HASH:]]" /// Must not __kcfi_typeid symbols for non-address-taken declarations // CHECK-NOT: module asm ".weak __kcfi_typeid_{{f6|_Z2f6v}}" + +// C: @ifunc1 = ifunc i32 (i32), ptr @resolver1 +// C: @ifunc2 = ifunc i64 (i64), ptr @resolver2 typedef int (*fn_t)(void); // CHECK: define dso_local{{.*}} i32 @{{f1|_Z2f1v}}(){{.*}} !kcfi_type ![[#TYPE:]] @@ -45,6 +48,16 @@ static int f5(void) { return 2; } // CHECK-DAG: declare !kcfi_type ![[#TYPE]]{{.*}} i32 @{{f6|_Z2f6v}}() extern int f6(void); +#ifndef __cplusplus +// C: define internal ptr @resolver1() #[[#]] !kcfi_type ![[#]] { +int ifunc1(int) __attribute__((ifunc("resolver1"))); +static void *resolver1(void) { return 0; } + +// C: define internal ptr @resolver2() #[[#]] !kcfi_type ![[#]] { +static void *resolver2(void) { return 0; } +long ifunc2(long) __attribute__((ifunc("resolver2"))); +#endif + int test(void) { return call(f1) + __call((fn_t)f2) + diff --git a/clang/test/CodeGen/paren-list-agg-init.cpp b/clang/test/CodeGen/paren-list-agg-init.cpp index 88b1834d42d879..16cfe772a4ae59 100644 --- a/clang/test/CodeGen/paren-list-agg-init.cpp +++ b/clang/test/CodeGen/paren-list-agg-init.cpp @@ -48,14 +48,13 @@ struct E { ~E() {}; }; -// CHECK-DAG: [[STRUCT_F:%.*]] = type { i8 } struct F { F (int i = 1); F (const F &f) = delete; F (F &&f) = default; }; -// CHECK-DAG: [[STRUCT_G:%.*]] = type <{ i32, [[STRUCT_F]], [3 x i8] }> +// CHECK-DAG: [[STRUCT_G:%.*]] = type <{ i32, [4 x i8] }> struct G { int a; F f; @@ -78,12 +77,12 @@ namespace gh61145 { ~Vec(); }; - // CHECK-DAG: [[STRUCT_S1:%.*]] = type { [[STRUCT_VEC]] } + // CHECK-DAG: [[STRUCT_S1:%.*]] = type { i8 } struct S1 { Vec v; }; - // CHECK-DAG: [[STRUCT_S2:%.*]] = type { [[STRUCT_VEC]], i8 } + // CHECK-DAG: [[STRUCT_S2:%.*]] = type { i8, i8 } struct S2 { Vec v; char c; @@ -377,7 +376,7 @@ void foo18() { // CHECK-NEXT: [[G:%.*g.*]] = alloca [[STRUCT_G]], align 4 // CHECK-NEXT: [[A:%.*a.*]] = getelementptr inbounds [[STRUCT_G]], ptr [[G]], i32 0, i32 0 // CHECK-NEXT: store i32 2, ptr [[A]], align 4 -// CHECK-NEXT: [[F:%.*f.*]] = getelementptr inbounds [[STRUCT_G]], ptr [[G]], i32 0, i32 1 +// CHECK-NEXT: [[F:%.*]] = getelementptr inbounds i8, ptr [[G]], i64 4 // CHECk-NEXT: call void @{{.*F.*}}(ptr noundef nonnull align 1 dereferenceable(1)) [[F]], ie32 noundef 1) // CHECK: ret void void foo19() { @@ -392,9 +391,8 @@ namespace gh61145 { // CHECK-NEXT: [[AGG_TMP_ENSURED:%.*agg.tmp.ensured.*]] = alloca [[STRUCT_S1]], align 1 // a.k.a. Vec::Vec() // CHECK-NEXT: call void @_ZN7gh611453VecC1Ev(ptr noundef nonnull align 1 dereferenceable(1) [[V]]) - // CHECK-NEXT: [[V1:%.*v1.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[AGG_TMP_ENSURED]], i32 0, i32 0 // a.k.a. Vec::Vec(Vec&&) - // CHECK-NEXT: call void @_ZN7gh611453VecC1EOS0_(ptr noundef nonnull align 1 dereferenceable(1) [[V1]], ptr noundef nonnull align 1 dereferenceable(1) [[V]]) + // CHECK-NEXT: call void @_ZN7gh611453VecC1EOS0_(ptr noundef nonnull align 1 dereferenceable(1) [[AGG_TMP_ENSURED]], ptr noundef nonnull align 1 dereferenceable(1) [[V]]) // a.k.a. S1::~S1() // CHECK-NEXT: call void @_ZN7gh611452S1D1Ev(ptr noundef nonnull align 1 dereferenceable(1) [[AGG_TMP_ENSURED]]) // a.k.a.Vec::~Vec() @@ -413,9 +411,8 @@ namespace gh61145 { // CHECK-NEXT: [[AGG_TMP_ENSURED:%.*agg.tmp.ensured.*]] = alloca [[STRUCT_S2]], align 1 // a.k.a. Vec::Vec() // CHECK-NEXT: call void @_ZN7gh611453VecC1Ev(ptr noundef nonnull align 1 dereferenceable(1) [[V]]) - // CHECK-NEXT: [[V1:%.*v1.*]] = getelementptr inbounds [[STRUCT_S2]], ptr [[AGG_TMP_ENSURED]], i32 0, i32 0 // a.k.a. Vec::Vec(Vec&&) - // CHECK-NEXT: call void @_ZN7gh611453VecC1EOS0_(ptr noundef nonnull align 1 dereferenceable(1) [[V1]], ptr noundef nonnull align 1 dereferenceable(1) [[V]]) + // CHECK-NEXT: call void @_ZN7gh611453VecC1EOS0_(ptr noundef nonnull align 1 dereferenceable(1) [[AGG_TMP_ENSURED]], ptr noundef nonnull align 1 dereferenceable(1) [[V]]) // CHECK-NEXT: [[C:%.*c.*]] = getelementptr inbounds [[STRUCT_S2]], ptr [[AGG_TMP_ENSURED]], i32 0, i32 // CHECK-NEXT: store i8 0, ptr [[C]], align 1 // a.k.a. S2::~S2() diff --git a/clang/test/CodeGen/ubsan-shift-bitint.c b/clang/test/CodeGen/ubsan-shift-bitint.c index af65ed60918b08..9e4ec15060b3fd 100644 --- a/clang/test/CodeGen/ubsan-shift-bitint.c +++ b/clang/test/CodeGen/ubsan-shift-bitint.c @@ -5,14 +5,16 @@ // CHECK-LABEL: define{{.*}} i32 @test_left_variable int test_left_variable(unsigned _BitInt(5) b, unsigned _BitInt(2) e) { - // CHECK: [[E_REG:%.+]] = load [[E_SIZE:i2]] + // CHECK: load i8 + // CHECK: [[E_REG:%.+]] = trunc i8 {{.*}} to [[E_SIZE:i2]] // CHECK: icmp ule [[E_SIZE]] [[E_REG]], -1, return b << e; } // CHECK-LABEL: define{{.*}} i32 @test_right_variable int test_right_variable(unsigned _BitInt(2) b, unsigned _BitInt(3) e) { - // CHECK: [[E_REG:%.+]] = load [[E_SIZE:i3]] + // CHECK: load i8 + // CHECK: [[E_REG:%.+]] = trunc i8 {{.*}} to [[E_SIZE:i3]] // CHECK: icmp ule [[E_SIZE]] [[E_REG]], 1, return b >> e; } @@ -37,14 +39,16 @@ int test_right_literal(unsigned _BitInt(2) b) { // CHECK-LABEL: define{{.*}} i32 @test_signed_left_variable int test_signed_left_variable(unsigned _BitInt(15) b, _BitInt(2) e) { - // CHECK: [[E_REG:%.+]] = load [[E_SIZE:i2]] + // CHECK: load i8 + // CHECK: [[E_REG:%.+]] = trunc i8 {{.*}} to [[E_SIZE:i2]] // CHECK: icmp ule [[E_SIZE]] [[E_REG]], 1, return b << e; } // CHECK-LABEL: define{{.*}} i32 @test_signed_right_variable int test_signed_right_variable(unsigned _BitInt(32) b, _BitInt(4) e) { - // CHECK: [[E_REG:%.+]] = load [[E_SIZE:i4]] + // CHECK: load i8 + // CHECK: [[E_REG:%.+]] = trunc i8 {{.*}} to [[E_SIZE:i4]] // CHECK: icmp ule [[E_SIZE]] [[E_REG]], 7, return b >> e; } diff --git a/clang/test/CodeGen/voidptr-vaarg.c b/clang/test/CodeGen/voidptr-vaarg.c index d023ddf0fb5d2d..4f008fd85115a9 100644 --- a/clang/test/CodeGen/voidptr-vaarg.c +++ b/clang/test/CodeGen/voidptr-vaarg.c @@ -245,7 +245,8 @@ typedef struct { // CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 // CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[RETVAL]], ptr align 4 [[ARGP_CUR]], i32 4, i1 false) -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[RETVAL]], align 4 +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_EMPTY_INT_T]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[COERCE_DIVE]], align 4 // CHECK-NEXT: ret i32 [[TMP0]] // empty_int_t empty_int(__builtin_va_list list) { diff --git a/clang/test/CodeGenCXX/2011-12-19-init-list-ctor.cpp b/clang/test/CodeGenCXX/2011-12-19-init-list-ctor.cpp index 14557829268efd..3efb8c449c8fa9 100644 --- a/clang/test/CodeGenCXX/2011-12-19-init-list-ctor.cpp +++ b/clang/test/CodeGenCXX/2011-12-19-init-list-ctor.cpp @@ -19,8 +19,8 @@ struct S { }; // CHECK: store i32 0, ptr @arr -// CHECK: call void @_ZN1AC1EPKc(ptr {{[^,]*}} getelementptr inbounds (%struct.S, ptr @arr, i32 0, i32 1), ptr noundef @.str) +// CHECK: call void @_ZN1AC1EPKc(ptr {{[^,]*}} getelementptr inbounds (i8, ptr @arr, i64 4), ptr noundef @.str) // CHECK: store i32 1, ptr getelementptr inbounds (%struct.S, ptr @arr, i64 1) -// CHECK: call void @_ZN1AC1EPKc(ptr {{[^,]*}} getelementptr inbounds (%struct.S, ptr getelementptr inbounds (%struct.S, ptr @arr, i64 1), i32 0, i32 1), ptr noundef @.str.1) +// CHECK: call void @_ZN1AC1EPKc(ptr {{[^,]*}} getelementptr inbounds (i8, ptr getelementptr inbounds (%struct.S, ptr @arr, i64 1), i64 4), ptr noundef @.str.1) // CHECK: store i32 2, ptr getelementptr inbounds (%struct.S, ptr @arr, i64 2) -// CHECK: call void @_ZN1AC1EPKc(ptr {{[^,]*}} getelementptr inbounds (%struct.S, ptr getelementptr inbounds (%struct.S, ptr @arr, i64 2), i32 0, i32 1), ptr noundef @.str.2) +// CHECK: call void @_ZN1AC1EPKc(ptr {{[^,]*}} getelementptr inbounds (i8, ptr getelementptr inbounds (%struct.S, ptr @arr, i64 2), i64 4), ptr noundef @.str.2) diff --git a/clang/test/CodeGenCXX/bitfield-access-empty.cpp b/clang/test/CodeGenCXX/bitfield-access-empty.cpp index 96047ce4729979..4922ed1e7f3de8 100644 --- a/clang/test/CodeGenCXX/bitfield-access-empty.cpp +++ b/clang/test/CodeGenCXX/bitfield-access-empty.cpp @@ -26,6 +26,7 @@ // RUN: %clang_cc1 -triple=bpf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s // RUN: %clang_cc1 -triple=csky %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s // RUN: %clang_cc1 -triple=hexagon-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=le64-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s // RUN: %clang_cc1 -triple=loongarch32-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s // RUN: %clang_cc1 -triple=nvptx-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s // RUN: %clang_cc1 -triple=riscv32 %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s @@ -83,8 +84,8 @@ struct P3 { unsigned b : 16; } p3; // CHECK-LABEL: LLVMType:%struct.P3 = -// LAYOUT-SAME: type { i16, %struct.Empty, i16, [2 x i8] } -// LAYOUT-DWN32-SAME: type <{ i16, %struct.Empty, i16 }> +// LAYOUT-SAME: type { i16, [2 x i8], i16, [2 x i8] } +// LAYOUT-DWN32-SAME: type <{ i16, i8, i16 }> // CHECK-NEXT: NonVirtualBaseLLVMType:%struct.P3 = // CHECK: BitFields:[ // LAYOUT-NEXT: diff --git a/clang/test/CodeGenCXX/bitfield-access-tail.cpp b/clang/test/CodeGenCXX/bitfield-access-tail.cpp index fb961f327f2e5c..1539e17cad4369 100644 --- a/clang/test/CodeGenCXX/bitfield-access-tail.cpp +++ b/clang/test/CodeGenCXX/bitfield-access-tail.cpp @@ -26,6 +26,7 @@ // RUN: %clang_cc1 -triple=bpf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT64 %s // RUN: %clang_cc1 -triple=csky %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT32 %s // RUN: %clang_cc1 -triple=hexagon-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT32 %s +// RUN: %clang_cc1 -triple=le64-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT64 %s // RUN: %clang_cc1 -triple=loongarch32-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT32 %s // RUN: %clang_cc1 -triple=nvptx-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT32 %s // RUN: %clang_cc1 -triple=riscv32 %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT32 %s diff --git a/clang/test/CodeGenCXX/class-layout.cpp b/clang/test/CodeGenCXX/class-layout.cpp index 84b0f887876ac5..90617d25b254ee 100644 --- a/clang/test/CodeGenCXX/class-layout.cpp +++ b/clang/test/CodeGenCXX/class-layout.cpp @@ -83,7 +83,7 @@ namespace Test6 { namespace Test7 { #pragma pack (1) class A {}; - // CHECK: %"class.Test7::B" = type <{ ptr, %"class.Test7::A" }> + // CHECK: %"class.Test7::B" = type <{ ptr, i8 }> class B { virtual ~B(); A a; diff --git a/clang/test/CodeGenCXX/compound-literals.cpp b/clang/test/CodeGenCXX/compound-literals.cpp index fcec2d19e2def0..1b4a1d4445123e 100644 --- a/clang/test/CodeGenCXX/compound-literals.cpp +++ b/clang/test/CodeGenCXX/compound-literals.cpp @@ -20,7 +20,7 @@ int f() { // CHECK: [[LVALUE:%[a-z0-9.]+]] = alloca // CHECK-NEXT: [[I:%[a-z0-9]+]] = getelementptr inbounds {{.*}}, ptr [[LVALUE]], i32 0, i32 0 // CHECK-NEXT: store i32 17, ptr [[I]] - // CHECK-NEXT: [[X:%[a-z0-9]+]] = getelementptr inbounds {{.*}} [[LVALUE]], i32 0, i32 1 + // CHECK-NEXT: [[X:%[a-z0-9]+]] = getelementptr inbounds {{.*}} [[LVALUE]], i32 4 // CHECK-NEXT: call noundef ptr @_ZN1XC1EPKc({{.*}}[[X]] // CHECK-NEXT: [[I:%[a-z0-9]+]] = getelementptr inbounds {{.*}} [[LVALUE]], i32 0, i32 0 // CHECK-NEXT: [[RESULT:%[a-z0-9]+]] = load i32, ptr diff --git a/clang/test/CodeGenCXX/cxx1z-constexpr-if.cpp b/clang/test/CodeGenCXX/cxx1z-constexpr-if.cpp index d14e36406a45e6..5a11afb8dec40a 100644 --- a/clang/test/CodeGenCXX/cxx1z-constexpr-if.cpp +++ b/clang/test/CodeGenCXX/cxx1z-constexpr-if.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -std=c++1z %s -emit-llvm -fblocks -triple x86_64-apple-darwin10 -o - | FileCheck %s --implicit-check-not=should_not_be_used +// RUN: %clang_cc1 -std=c++1z %s -emit-llvm -fblocks -triple x86_64-apple-darwin10 -o - -fexperimental-new-constant-interpreter | FileCheck %s --implicit-check-not=should_not_be_used void should_be_used_1(); void should_be_used_2(); diff --git a/clang/test/CodeGenCXX/exceptions.cpp b/clang/test/CodeGenCXX/exceptions.cpp index e8179f9828fb6b..1f4d2f061a43d4 100644 --- a/clang/test/CodeGenCXX/exceptions.cpp +++ b/clang/test/CodeGenCXX/exceptions.cpp @@ -513,8 +513,7 @@ namespace test11 { // CHECK-LABEL: define{{.*}} void @_ZN6test111CC2Ev( // CHECK: [[THIS:%.*]] = load ptr, ptr {{%.*}} // Construct single. - // CHECK-NEXT: [[SINGLE:%.*]] = getelementptr inbounds [[C:%.*]], ptr [[THIS]], i32 0, i32 0 - // CHECK-NEXT: call void @_ZN6test111AC1Ev(ptr {{[^,]*}} [[SINGLE]]) + // CHECK-NEXT: call void @_ZN6test111AC1Ev(ptr {{[^,]*}} [[THIS]]) // Construct array. // CHECK-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[C:%.*]], ptr [[THIS]], i32 0, i32 1 // CHECK-NEXT: [[ARRAYBEGIN:%.*]] = getelementptr inbounds [2 x [3 x [[A:%.*]]]], ptr [[ARRAY]], i32 0, i32 0, i32 0 @@ -560,8 +559,8 @@ namespace test11 { // CHECK: br label // Finally, the cleanup for single. - // CHECK98: invoke void @_ZN6test111AD1Ev(ptr {{[^,]*}} [[SINGLE]]) - // CHECK11: call void @_ZN6test111AD1Ev(ptr {{[^,]*}} [[SINGLE]]) + // CHECK98: invoke void @_ZN6test111AD1Ev(ptr {{[^,]*}} [[THIS]]) + // CHECK11: call void @_ZN6test111AD1Ev(ptr {{[^,]*}} [[THIS]]) // CHECK: br label // CHECK: resume diff --git a/clang/test/CodeGenCXX/ext-int.cpp b/clang/test/CodeGenCXX/ext-int.cpp index a1d17c840ee460..e58375ca66996e 100644 --- a/clang/test/CodeGenCXX/ext-int.cpp +++ b/clang/test/CodeGenCXX/ext-int.cpp @@ -52,20 +52,20 @@ struct HasBitIntFirst { _BitInt(35) A; int B; }; -// CHECK: %struct.HasBitIntFirst = type { i35, i32 } +// CHECK: %struct.HasBitIntFirst = type { i64, i32 } struct HasBitIntLast { int A; _BitInt(35) B; }; -// CHECK: %struct.HasBitIntLast = type { i32, i35 } +// CHECK: %struct.HasBitIntLast = type { i32, i64 } struct HasBitIntMiddle { int A; _BitInt(35) B; int C; }; -// CHECK: %struct.HasBitIntMiddle = type { i32, i35, i32 } +// CHECK: %struct.HasBitIntMiddle = type { i32, i64, i32 } // Force emitting of the above structs. void StructEmit() { @@ -170,27 +170,35 @@ void TakesVarargs(int i, ...) { // LIN64: %[[FITSINGP:.+]] = icmp ule i32 %[[GPOFFSET]], 32 // LIN64: br i1 %[[FITSINGP]] // LIN64: %[[BC1:.+]] = phi ptr - // LIN64: %[[LOAD1:.+]] = load i92, ptr %[[BC1]] - // LIN64: store i92 %[[LOAD1]], ptr + // LIN64: %[[LOAD1:.+]] = load i128, ptr %[[BC1]] + // LIN64: %[[T:.+]] = trunc i128 %[[LOAD1]] to i92 + // LIN64: %[[S:.+]] = sext i92 %[[T]] to i128 + // LIN64: store i128 %[[S]], ptr // LIN32: %[[CUR1:.+]] = load ptr, ptr %[[ARGS]] // LIN32: %[[NEXT1:.+]] = getelementptr inbounds i8, ptr %[[CUR1]], i32 12 // LIN32: store ptr %[[NEXT1]], ptr %[[ARGS]] - // LIN32: %[[LOADV1:.+]] = load i92, ptr %[[CUR1]] - // LIN32: store i92 %[[LOADV1]], ptr + // LIN32: %[[LOADV1:.+]] = load i96, ptr %[[CUR1]] + // LIN32: %[[TR:.+]] = trunc i96 %[[LOADV1]] to i92 + // LIN32: %[[SEXT:.+]] = sext i92 %[[TR]] to i96 + // LIN32: store i96 %[[SEXT]], ptr // WIN64: %[[CUR1:.+]] = load ptr, ptr %[[ARGS]] // WIN64: %[[NEXT1:.+]] = getelementptr inbounds i8, ptr %[[CUR1]], i64 8 // WIN64: store ptr %[[NEXT1]], ptr %[[ARGS]] // WIN64: %[[LOADP1:.+]] = load ptr, ptr %[[CUR1]] - // WIN64: %[[LOADV1:.+]] = load i92, ptr %[[LOADP1]] - // WIN64: store i92 %[[LOADV1]], ptr + // WIN64: %[[LOADV1:.+]] = load i128, ptr %[[LOADP1]] + // WIN64: %[[TR:.+]] = trunc i128 %[[LOADV1]] to i92 + // WIN64: %[[SEXT:.+]] = sext i92 %[[TR]] to i128 + // WIN64: store i128 %[[SEXT]], ptr // WIN32: %[[CUR1:.+]] = load ptr, ptr %[[ARGS]] // WIN32: %[[NEXT1:.+]] = getelementptr inbounds i8, ptr %[[CUR1]], i32 16 // WIN32: store ptr %[[NEXT1]], ptr %[[ARGS]] - // WIN32: %[[LOADV1:.+]] = load i92, ptr %[[CUR1]] - // WIN32: store i92 %[[LOADV1]], ptr + // WIN32: %[[LOADV1:.+]] = load i128, ptr %[[CUR1]] + // WIN32: %[[TR:.+]] = trunc i128 %[[LOADV1]] to i92 + // WIN32: %[[SEXT:.+]] = sext i92 %[[TR]] to i128 + // WIN32: store i128 %[[SEXT]], ptr _BitInt(31) B = __builtin_va_arg(args, _BitInt(31)); @@ -200,26 +208,34 @@ void TakesVarargs(int i, ...) { // LIN64: %[[FITSINGP:.+]] = icmp ule i32 %[[GPOFFSET]], 40 // LIN64: br i1 %[[FITSINGP]] // LIN64: %[[BC1:.+]] = phi ptr - // LIN64: %[[LOAD1:.+]] = load i31, ptr %[[BC1]] - // LIN64: store i31 %[[LOAD1]], ptr + // LIN64: %[[LOAD1:.+]] = load i32, ptr %[[BC1]] + // LIN64: %[[T:.+]] = trunc i32 %[[LOAD1]] to i31 + // LIN64: %[[S:.+]] = sext i31 %[[T]] to i32 + // LIN64: store i32 %[[S]], ptr // LIN32: %[[CUR2:.+]] = load ptr, ptr %[[ARGS]] // LIN32: %[[NEXT2:.+]] = getelementptr inbounds i8, ptr %[[CUR2]], i32 4 // LIN32: store ptr %[[NEXT2]], ptr %[[ARGS]] - // LIN32: %[[LOADV2:.+]] = load i31, ptr %[[CUR2]] - // LIN32: store i31 %[[LOADV2]], ptr + // LIN32: %[[LOADV2:.+]] = load i32, ptr %[[CUR2]] + // LIN32: %[[T:.+]] = trunc i32 %[[LOADV2]] to i31 + // LIN32: %[[S:.+]] = sext i31 %[[T]] to i32 + // LIN32: store i32 %[[S]], ptr // WIN64: %[[CUR2:.+]] = load ptr, ptr %[[ARGS]] // WIN64: %[[NEXT2:.+]] = getelementptr inbounds i8, ptr %[[CUR2]], i64 8 // WIN64: store ptr %[[NEXT2]], ptr %[[ARGS]] - // WIN64: %[[LOADV2:.+]] = load i31, ptr %[[CUR2]] - // WIN64: store i31 %[[LOADV2]], ptr + // WIN64: %[[LOADV2:.+]] = load i32, ptr %[[CUR2]] + // WIN64: %[[T:.+]] = trunc i32 %[[LOADV2]] to i31 + // WIN64: %[[S:.+]] = sext i31 %[[T]] to i32 + // WIN64: store i32 %[[S]], ptr // WIN32: %[[CUR2:.+]] = load ptr, ptr %[[ARGS]] // WIN32: %[[NEXT2:.+]] = getelementptr inbounds i8, ptr %[[CUR2]], i32 4 // WIN32: store ptr %[[NEXT2]], ptr %[[ARGS]] - // WIN32: %[[LOADV2:.+]] = load i31, ptr %[[CUR2]] - // WIN32: store i31 %[[LOADV2]], ptr + // WIN32: %[[LOADV2:.+]] = load i32, ptr %[[CUR2]] + // WIN32: %[[T:.+]] = trunc i32 %[[LOADV2]] to i31 + // WIN32: %[[S:.+]] = sext i31 %[[T]] to i32 + // WIN32: store i32 %[[S]], ptr _BitInt(16) C = __builtin_va_arg(args, _BitInt(16)); // LIN64: %[[AD3:.+]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr %[[ARGS]] @@ -297,7 +313,7 @@ void TakesVarargs(int i, ...) { // WIN: store ptr %[[NEXT5]], ptr %[[ARGS]] // WIN64: %[[LOADP5:.+]] = load ptr, ptr %[[CUR5]] // WIN64: %[[LOADV5:.+]] = load <8 x i32>, ptr %[[LOADP5]] - // WIN32: %[[LOADV5:.+]] = load <8 x i32>, ptr %argp.cur7 + // WIN32: %[[LOADV5:.+]] = load <8 x i32>, ptr %argp.cur9 // WIN: store <8 x i32> %[[LOADV5]], ptr __builtin_va_end(args); @@ -503,11 +519,13 @@ void Shift(_BitInt(28) Ext, _BitInt(65) LargeExt, int i) { // CHECK: ashr i32 {{.+}}, %[[PROMO]] Ext << i; + // CHECK: %[[BI:.+]] = trunc i32 %{{.+}} to i28 // CHECK: %[[PROMO:.+]] = trunc i32 %{{.+}} to i28 - // CHECK: shl i28 {{.+}}, %[[PROMO]] + // CHECK: shl i28 %[[BI]], %[[PROMO]] Ext >> i; + // CHECK: %[[BI:.+]] = trunc i32 %{{.+}} to i28 // CHECK: %[[PROMO:.+]] = trunc i32 %{{.+}} to i28 - // CHECK: ashr i28 {{.+}}, %[[PROMO]] + // CHECK: ashr i28 %[[BI]], %[[PROMO]] LargeExt << i; // CHECK: %[[PROMO:.+]] = zext i32 %{{.+}} to i65 @@ -577,7 +595,7 @@ void TBAATest(_BitInt(sizeof(int) * 8) ExtInt, _BitInt(6) Other) { // CHECK-DAG: store i32 %{{.+}}, ptr %{{.+}}, align 4, !tbaa ![[EXTINT_TBAA:.+]] // CHECK-DAG: store i32 %{{.+}}, ptr %{{.+}}, align 4, !tbaa ![[EXTINT_TBAA]] - // CHECK-DAG: store i6 %{{.+}}, ptr %{{.+}}, align 1, !tbaa ![[EXTINT6_TBAA:.+]] + // CHECK-DAG: store i8 %{{.+}}, ptr %{{.+}}, align 1, !tbaa ![[EXTINT6_TBAA:.+]] ExtInt = 5; ExtUInt = 5; Other = 5; diff --git a/clang/test/CodeGenCXX/lambda-deterministic-captures.cpp b/clang/test/CodeGenCXX/lambda-deterministic-captures.cpp index 6236ff2ca66cb7..5f14e3977db00c 100644 --- a/clang/test/CodeGenCXX/lambda-deterministic-captures.cpp +++ b/clang/test/CodeGenCXX/lambda-deterministic-captures.cpp @@ -16,8 +16,7 @@ void foo() { } // CHECK: define{{.*}} void @_Z3foov -// CHECK: getelementptr inbounds %{{.+}}, ptr %{{.+}}, i32 0, i32 0 -// CHECK-NEXT: getelementptr inbounds %{{.+}}, ptr %{{.+}}, i32 0, i32 1 +// CHECK: getelementptr inbounds %{{.+}}, ptr %{{.+}}, i32 0, i32 1 // CHECK-NEXT: store float 0.000 // CHECK-NEXT: getelementptr inbounds %{{.+}}, ptr %{{.+}}, i32 0, i32 2 // CHECK-NEXT: store float 1.000 @@ -27,7 +26,6 @@ void foo() { // The lambda body. Reverse iteration when the captures aren't deterministic // causes these to be laid out differently in the lambda. // CHECK: define internal void -// CHECK: getelementptr inbounds %{{.+}}, ptr %{{.+}}, i32 0, i32 0 // CHECK: getelementptr inbounds %{{.+}}, ptr %{{.+}}, i32 0, i32 1 // CHECK: getelementptr inbounds %{{.+}}, ptr %{{.+}}, i32 0, i32 2 // CHECK: getelementptr inbounds %{{.+}}, ptr %{{.+}}, i32 0, i32 3 diff --git a/clang/test/CodeGenCXX/nullptr.cpp b/clang/test/CodeGenCXX/nullptr.cpp index ca76c55e2122d1..0d8837b216bec1 100644 --- a/clang/test/CodeGenCXX/nullptr.cpp +++ b/clang/test/CodeGenCXX/nullptr.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -std=c++11 -triple x86_64-apple-darwin10 -I%S -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -std=c++11 -triple x86_64-apple-darwin10 -I%S -emit-llvm -o - %s -fexperimental-new-constant-interpreter | FileCheck %s #include diff --git a/clang/test/CodeGenCXX/partial-destruction.cpp b/clang/test/CodeGenCXX/partial-destruction.cpp index 8ceb4b9bbedeaa..ab6155e6f6a71c 100644 --- a/clang/test/CodeGenCXX/partial-destruction.cpp +++ b/clang/test/CodeGenCXX/partial-destruction.cpp @@ -107,13 +107,12 @@ namespace test1 { // CHECK: [[V:%.*]] = alloca [[B:%.*]], align 4 // CHECK-NEXT: alloca ptr // CHECK-NEXT: alloca i32 - // CHECK-NEXT: [[X:%.*]] = getelementptr inbounds [[B]], ptr [[V]], i32 0, i32 0 - // CHECK-NEXT: call void @_ZN5test11AC1Ei(ptr {{[^,]*}} [[X]], i32 noundef 5) - // CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[B]], ptr [[V]], i32 0, i32 1 + // CHECK-NEXT: call void @_ZN5test11AC1Ei(ptr {{[^,]*}} [[V]], i32 noundef 5) + // CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 1 // CHECK-NEXT: invoke void @_ZN5test11AC1Ei(ptr {{[^,]*}} [[Y]], i32 noundef 6) - // CHECK: [[Z:%.*]] = getelementptr inbounds [[B]], ptr [[V]], i32 0, i32 2 + // CHECK: [[Z:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 2 // CHECK-NEXT: invoke void @_ZN5test11AC1Ei(ptr {{[^,]*}} [[Z]], i32 noundef 7) - // CHECK: [[W:%.*]] = getelementptr inbounds [[B]], ptr [[V]], i32 0, i32 3 + // CHECK: [[W:%.*]] = getelementptr inbounds [[B]], ptr [[V]], i32 0, i32 1 // CHECK-NEXT: store i32 8, ptr [[W]], align 4 // CHECK-NEXT: call void @_ZN5test11BD1Ev(ptr {{[^,]*}} [[V]]) // CHECK-NEXT: ret void @@ -124,9 +123,9 @@ namespace test1 { // CHECK: landingpad { ptr, i32 } // CHECK-NEXT: cleanup // CHECKv03: invoke void @_ZN5test11AD1Ev(ptr {{[^,]*}} [[Y]]) - // CHECKv03: invoke void @_ZN5test11AD1Ev(ptr {{[^,]*}} [[X]]) + // CHECKv03: invoke void @_ZN5test11AD1Ev(ptr {{[^,]*}} [[V]]) // CHECKv11: call void @_ZN5test11AD1Ev(ptr {{[^,]*}} [[Y]]) - // CHECKv11: call void @_ZN5test11AD1Ev(ptr {{[^,]*}} [[X]]) + // CHECKv11: call void @_ZN5test11AD1Ev(ptr {{[^,]*}} [[V]]) } namespace test2 { diff --git a/clang/test/CodeGenCXX/pod-member-memcpys.cpp b/clang/test/CodeGenCXX/pod-member-memcpys.cpp index 16d3d45a8179b7..8efec6184a3daf 100644 --- a/clang/test/CodeGenCXX/pod-member-memcpys.cpp +++ b/clang/test/CodeGenCXX/pod-member-memcpys.cpp @@ -1,6 +1,8 @@ // RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-apple-darwin10 -emit-llvm -std=c++03 -fexceptions -fcxx-exceptions -o - %s | FileCheck %s // RUN: %clang_cc1 -no-enable-noundef-analysis -triple i386-apple-darwin10 -emit-llvm -std=c++03 -o - %s | FileCheck --check-prefix=CHECK-2 %s +struct Empty {}; + struct POD { int w, x, y, z; }; @@ -106,6 +108,20 @@ struct __attribute__((packed)) PackedMembers { int w, x, y, z; }; +struct WithEmptyField { + int a; + Empty e; + NonPOD np; + int b; +}; + +struct WithEmptyNUAField { + int a; + [[no_unique_address]] Empty e; + NonPOD np; + int b; +}; + // COPY-ASSIGNMENT OPERATORS: // Assignment operators are output in the order they're encountered. @@ -121,6 +137,8 @@ CALL_AO(VolatileMember) CALL_AO(BitfieldMember) CALL_AO(InnerClassMember) CALL_AO(PackedMembers) +CALL_AO(WithEmptyField) +CALL_AO(WithEmptyNUAField) // Basic copy-assignment: // CHECK-LABEL: define linkonce_odr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN5BasicaSERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) @@ -185,6 +203,18 @@ CALL_AO(PackedMembers) // CHECK: call void @llvm.memcpy.p0.p0.i64({{.*}} align 1 {{.*}} align 1 {{.*}}i64 16, i1 {{.*}}) // CHECK: ret ptr +// WithEmptyField copy-assignment: +// CHECK-LABEL: define linkonce_odr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN14WithEmptyFieldaSERKS_ +// CHECK: call void @llvm.memcpy.p0.p0.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 4, i1 {{.*}}) +// CHECK: call nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN6NonPODaSERKS_ +// CHECK: ret ptr + +// WithEmptyNUAField copy-assignment: +// CHECK-LABEL: define linkonce_odr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN17WithEmptyNUAFieldaSERKS_ +// CHECK: call void @llvm.memcpy.p0.p0.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 4, i1 {{.*}}) +// CHECK: call nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN6NonPODaSERKS_ +// CHECK: ret ptr + // COPY-CONSTRUCTORS: // Clang outputs copy-constructors in the reverse of the order that @@ -280,3 +310,15 @@ CALL_CC(Basic) // CHECK: call void @_ZN6NonPODC1ERKS_ // CHECK: call void @llvm.memcpy.p0.p0.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}}) // CHECK: ret void + +CALL_CC(WithEmptyField) +// WithEmptyField copy-constructor: +// CHECK-LABEL: define linkonce_odr void @_ZN14WithEmptyFieldC2ERKS_ +// CHECK: call void @llvm.memcpy.p0.p0.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 4, i1 {{.*}}) +// CHECK: call void @_ZN6NonPODC1ERKS_ + +CALL_CC(WithEmptyNUAField) +// WithEmptyNUAField copy-constructor: +// CHECK-LABEL: define linkonce_odr void @_ZN17WithEmptyNUAFieldC2ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) +// CHECK: call void @llvm.memcpy.p0.p0.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 4, i1 {{.*}}) +// CHECK: call void @_ZN6NonPODC1ERKS_ diff --git a/clang/test/CodeGenCXX/pr18962.cpp b/clang/test/CodeGenCXX/pr18962.cpp index b564a7b9a73afd..9ac87003c94c57 100644 --- a/clang/test/CodeGenCXX/pr18962.cpp +++ b/clang/test/CodeGenCXX/pr18962.cpp @@ -23,7 +23,6 @@ D p3; // We end up using an opaque type for 'append' to avoid circular references. // CHECK: %class.A = type { ptr } -// CHECK: %class.C = type <{ ptr, %class.B, [3 x i8] }> -// CHECK: %class.B = type { i8 } +// CHECK: %class.C = type <{ ptr, [4 x i8] }> // CHECK: %class.D = type { %class.C.base, [3 x i8] } -// CHECK: %class.C.base = type <{ ptr, %class.B }> +// CHECK: %class.C.base = type <{ ptr, i8 }> diff --git a/clang/test/CodeGenCXX/references.cpp b/clang/test/CodeGenCXX/references.cpp index 0fca5e76659c2f..b84cb788d161c3 100644 --- a/clang/test/CodeGenCXX/references.cpp +++ b/clang/test/CodeGenCXX/references.cpp @@ -191,7 +191,6 @@ namespace N2 { // CHECK-LABEL: define{{.*}} void @_ZN2N21fEi // CHECK: call void @_ZN2N24getPEv - // CHECK: getelementptr inbounds // CHECK: store i32 17 // CHECK: call void @_ZN2N21PD1Ev void f(int i) { @@ -220,8 +219,7 @@ namespace N2 { // CHECK-LABEL: define{{.*}} void @_ZN2N21gEi // CHECK: call void @_ZN2N24getZEv - // CHECK: {{getelementptr inbounds.*i32 0, i32 0}} - // CHECK: {{getelementptr inbounds.*i32 0, i32 0}} + // CHECK: {{getelementptr inbounds.*i64 16}} // CHECK: store i32 19 // CHECK: call void @_ZN2N21ZD1Ev // CHECK: ret void diff --git a/clang/test/CodeGenCXX/temporaries.cpp b/clang/test/CodeGenCXX/temporaries.cpp index f992ce206c5815..9f697bd9bf3efc 100644 --- a/clang/test/CodeGenCXX/temporaries.cpp +++ b/clang/test/CodeGenCXX/temporaries.cpp @@ -715,7 +715,7 @@ namespace MultipleExtension { // CHECK: call i32 @__cxa_atexit({{.*}} @_ZN17MultipleExtension1AD1Ev, {{.*}} @[[TEMPA]] // CHECK: store {{.*}} @[[TEMPA]], {{.*}} @[[TEMPE:_ZGRN17MultipleExtension2e1E.*]], - // CHECK: call void @_ZN17MultipleExtension1BC1Ev({{.*}} getelementptr inbounds ({{.*}} @[[TEMPE]], i32 0, i32 1)) + // CHECK: call void @_ZN17MultipleExtension1BC1Ev({{.*}} getelementptr inbounds ({{.*}} @[[TEMPE]], i64 8)) // CHECK: call void @_ZN17MultipleExtension1DC1Ev({{.*}} @[[TEMPD:_ZGRN17MultipleExtension2e1E.*]]) // CHECK: call i32 @__cxa_atexit({{.*}} @_ZN17MultipleExtension1DD1Ev, {{.*}} @[[TEMPD]] @@ -729,7 +729,7 @@ namespace MultipleExtension { // CHECK: call i32 @__cxa_atexit({{.*}} @_ZN17MultipleExtension1AD1Ev, {{.*}} @[[TEMPA]] // CHECK: store {{.*}} @[[TEMPA]], {{.*}} @[[E:_ZN17MultipleExtension2e2E]] - // CHECK: call void @_ZN17MultipleExtension1BC1Ev({{.*}} getelementptr inbounds ({{.*}} @[[E]], i32 0, i32 1)) + // CHECK: call void @_ZN17MultipleExtension1BC1Ev({{.*}} getelementptr inbounds ({{.*}} @[[E]], i64 8)) // CHECK: call void @_ZN17MultipleExtension1DC1Ev({{.*}} @[[TEMPD:_ZGRN17MultipleExtension2e2E.*]]) // CHECK: call i32 @__cxa_atexit({{.*}} @_ZN17MultipleExtension1DD1Ev, {{.*}} @[[TEMPD]] @@ -744,11 +744,11 @@ namespace MultipleExtension { // CHECK: %[[TEMPE1_A:.*]] = getelementptr inbounds {{.*}} %[[TEMPE1:.*]], i32 0, i32 0 // CHECK: call void @[[NS]]1AC1Ev({{.*}} %[[TEMPA1:.*]]) // CHECK: store {{.*}} %[[TEMPA1]], {{.*}} %[[TEMPE1_A]] - // CHECK: %[[TEMPE1_B:.*]] = getelementptr inbounds {{.*}} %[[TEMPE1]], i32 0, i32 1 + // CHECK: %[[TEMPE1_B:.*]] = getelementptr inbounds {{.*}} %[[TEMPE1]], i64 8 // CHECK: call void @[[NS]]1BC1Ev({{.*}} %[[TEMPE1_B]]) // CHECK: %[[TEMPE1_C:.*]] = getelementptr inbounds {{.*}} %[[TEMPE1]], i32 0, i32 2 // CHECK: call void @[[NS]]1DC1Ev({{.*}} %[[TEMPD1:.*]]) - // CHECK: %[[TEMPD1_C:.*]] = getelementptr inbounds {{.*}} %[[TEMPD1]], i32 0, i32 1 + // CHECK: %[[TEMPD1_C:.*]] = getelementptr inbounds {{.*}} %[[TEMPD1]], i64 4 // CHECK: store {{.*}} %[[TEMPD1_C]], {{.*}} %[[TEMPE1_C]] // CHECK: store {{.*}} %[[TEMPE1]], {{.*}} %[[E1:.*]] @@ -759,11 +759,11 @@ namespace MultipleExtension { // CHECK: %[[TEMPE2_A:.*]] = getelementptr inbounds {{.*}} %[[E2:.*]], i32 0, i32 0 // CHECK: call void @[[NS]]1AC1Ev({{.*}} %[[TEMPA2:.*]]) // CHECK: store {{.*}} %[[TEMPA2]], {{.*}} %[[TEMPE2_A]] - // CHECK: %[[TEMPE2_B:.*]] = getelementptr inbounds {{.*}} %[[E2]], i32 0, i32 1 + // CHECK: %[[TEMPE2_B:.*]] = getelementptr inbounds {{.*}} %[[E2]], i64 8 // CHECK: call void @[[NS]]1BC1Ev({{.*}} %[[TEMPE2_B]]) // CHECK: %[[TEMPE2_C:.*]] = getelementptr inbounds {{.*}} %[[E2]], i32 0, i32 2 // CHECK: call void @[[NS]]1DC1Ev({{.*}} %[[TEMPD2:.*]]) - // CHECK: %[[TEMPD2_C:.*]] = getelementptr inbounds {{.*}} %[[TEMPD2]], i32 0, i32 1 + // CHECK: %[[TEMPD2_C:.*]] = getelementptr inbounds {{.*}} %[[TEMPD2]], i64 4 // CHECK: store {{.*}} %[[TEMPD2_C]], ptr %[[TEMPE2_C]] g(); diff --git a/clang/test/CodeGenCXX/zero-init-empty-virtual.cpp b/clang/test/CodeGenCXX/zero-init-empty-virtual.cpp new file mode 100644 index 00000000000000..b2823acdfa461c --- /dev/null +++ b/clang/test/CodeGenCXX/zero-init-empty-virtual.cpp @@ -0,0 +1,35 @@ +// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK + +struct polymorphic_base { + virtual void func() {} + virtual ~polymorphic_base() {} +}; + +struct Empty {}; +struct derived_virtual : virtual Empty {}; +struct derived : polymorphic_base {}; + +// CHECK: %struct.Holder1 = type { %struct.polymorphic_base } +// CHECK: %struct.polymorphic_base = type { ptr } +// CHECK: %struct.Holder2 = type { %struct.derived_virtual } +// CHECK: %struct.derived_virtual = type { ptr } +// CHECK: %struct.Holder3 = type { %struct.derived } +// CHECK: %struct.derived = type { %struct.polymorphic_base } + +struct Holder1 { + polymorphic_base a{}; +} g_holder1; + +// CHECK: @{{.*}} = {{.*}}global %struct.Holder1 { %struct.polymorphic_base { ptr {{.*}} } } + +struct Holder2 { + derived_virtual a{}; +} g_holder2; + +// CHECK: @{{.*}} = {{.*}}global %struct.Holder2 zeroinitializer + +struct Holder3 { + derived a{}; +} g_holder3; + +// CHECK: @{{.*}} = {{.*}}global { { ptr } } { { ptr } { ptr {{.*}} } } diff --git a/clang/test/CodeGenHIP/default-attributes.hip b/clang/test/CodeGenHIP/default-attributes.hip index 107ef6b94c4de6..1b53ebec9b5821 100644 --- a/clang/test/CodeGenHIP/default-attributes.hip +++ b/clang/test/CodeGenHIP/default-attributes.hip @@ -2,9 +2,6 @@ // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -fno-ident -fcuda-is-device \ // RUN: -emit-llvm -o - %s | FileCheck -check-prefix=OPTNONE %s -// RUN: %clang_cc1 -O3 -triple amdgcn-amd-amdhsa -x hip -fno-ident -fcuda-is-device \ -// RUN: -emit-llvm -o - %s | FileCheck -check-prefix=OPT %s - #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -13,53 +10,36 @@ // OPTNONE: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr)], section "llvm.metadata" // OPTNONE: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 //. -// OPT: @__hip_cuid_ = addrspace(1) global i8 0 -// OPT: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 -// OPT: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr)], section "llvm.metadata" -//. +__device__ void extern_func(); + // OPTNONE: Function Attrs: convergent mustprogress noinline nounwind optnone // OPTNONE-LABEL: define {{[^@]+}}@_Z4funcv // OPTNONE-SAME: () #[[ATTR0:[0-9]+]] { // OPTNONE-NEXT: entry: +// OPTNONE-NEXT: call void @_Z11extern_funcv() #[[ATTR3:[0-9]+]] // OPTNONE-NEXT: ret void // -// OPT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -// OPT-LABEL: define {{[^@]+}}@_Z4funcv -// OPT-SAME: () local_unnamed_addr #[[ATTR0:[0-9]+]] { -// OPT-NEXT: entry: -// OPT-NEXT: ret void -// __device__ void func() { - + extern_func(); } // OPTNONE: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone // OPTNONE-LABEL: define {{[^@]+}}@_Z6kernelv -// OPTNONE-SAME: () #[[ATTR1:[0-9]+]] { +// OPTNONE-SAME: () #[[ATTR2:[0-9]+]] { // OPTNONE-NEXT: entry: +// OPTNONE-NEXT: call void @_Z11extern_funcv() #[[ATTR3]] // OPTNONE-NEXT: ret void // -// OPT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -// OPT-LABEL: define {{[^@]+}}@_Z6kernelv -// OPT-SAME: () local_unnamed_addr #[[ATTR1:[0-9]+]] { -// OPT-NEXT: entry: -// OPT-NEXT: ret void -// __global__ void kernel() { - + extern_func(); } //. // OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// OPTNONE: attributes #[[ATTR1]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } -//. -// OPT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" } -// OPT: attributes #[[ATTR1]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +// OPTNONE: attributes #[[ATTR1:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// OPTNONE: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +// OPTNONE: attributes #[[ATTR3]] = { convergent nounwind } //. // OPTNONE: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} // OPTNONE: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} // OPTNONE: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} //. -// OPT: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} -// OPT: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} -// OPT: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -//. diff --git a/clang/test/CodeGenHIP/printf_nonhostcall.cpp b/clang/test/CodeGenHIP/printf_nonhostcall.cpp index 34904819ae072d..2c6d0ecac1e8a9 100644 --- a/clang/test/CodeGenHIP/printf_nonhostcall.cpp +++ b/clang/test/CodeGenHIP/printf_nonhostcall.cpp @@ -267,8 +267,10 @@ __device__ _BitInt(128) Int128 = 45637; // CHECK-NEXT: [[TMP4:%.*]] = load double, ptr addrspacecast (ptr addrspace(1) @f2 to ptr), align 8 // CHECK-NEXT: [[TMP5:%.*]] = load half, ptr addrspacecast (ptr addrspace(1) @f3 to ptr), align 2 // CHECK-NEXT: [[TMP6:%.*]] = load bfloat, ptr addrspacecast (ptr addrspace(1) @f4 to ptr), align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load i55, ptr addrspacecast (ptr addrspace(1) @Int55 to ptr), align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load i44, ptr addrspacecast (ptr addrspace(1) @Int44 to ptr), align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr addrspacecast (ptr addrspace(1) @Int55 to ptr), align 8 +// CHECK-NEXT: [[LOADEDV:%.*]] = trunc i64 [[TMP7]] to i55 +// CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr addrspacecast (ptr addrspace(1) @Int44 to ptr), align 8 +// CHECK-NEXT: [[LOADEDV2:%.*]] = trunc i64 [[TMP8]] to i44 // CHECK-NEXT: [[TMP9:%.*]] = load i128, ptr addrspacecast (ptr addrspace(1) @Int128 to ptr), align 8 // CHECK-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 108) // CHECK-NEXT: [[TMP10:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null @@ -286,30 +288,30 @@ __device__ _BitInt(128) Int128 = 45637; // CHECK-NEXT: store i64 [[TMP14]], ptr addrspace(1) [[TMP13]], align 8 // CHECK-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i32 8 // CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @_ZZ4foo3vE1s to ptr), ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 8 -// CHECK-NEXT: [[PRINTBUFFNEXTPTR2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8 // CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[CONV]] to i64 -// CHECK-NEXT: store i64 [[TMP15]], ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], align 8 -// CHECK-NEXT: [[PRINTBUFFNEXTPTR3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], i32 8 -// CHECK-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], align 8 +// CHECK-NEXT: store i64 [[TMP15]], ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], align 8 // CHECK-NEXT: [[PRINTBUFFNEXTPTR4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], i32 8 -// CHECK-NEXT: store double [[CONV1]], ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], align 8 +// CHECK-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], align 8 // CHECK-NEXT: [[PRINTBUFFNEXTPTR5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], i32 8 -// CHECK-NEXT: store double [[TMP4]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 8 +// CHECK-NEXT: store double [[CONV1]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 8 // CHECK-NEXT: [[PRINTBUFFNEXTPTR6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], i32 8 -// CHECK-NEXT: [[TMP16:%.*]] = fpext half [[TMP5]] to double -// CHECK-NEXT: store double [[TMP16]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 8 +// CHECK-NEXT: store double [[TMP4]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 8 // CHECK-NEXT: [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 8 -// CHECK-NEXT: [[TMP17:%.*]] = fpext bfloat [[TMP6]] to double -// CHECK-NEXT: store double [[TMP17]], ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 8 +// CHECK-NEXT: [[TMP16:%.*]] = fpext half [[TMP5]] to double +// CHECK-NEXT: store double [[TMP16]], ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 8 // CHECK-NEXT: [[PRINTBUFFNEXTPTR8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], i32 8 -// CHECK-NEXT: [[TMP18:%.*]] = zext i55 [[TMP7]] to i64 -// CHECK-NEXT: store i64 [[TMP18]], ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], align 8 +// CHECK-NEXT: [[TMP17:%.*]] = fpext bfloat [[TMP6]] to double +// CHECK-NEXT: store double [[TMP17]], ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], align 8 // CHECK-NEXT: [[PRINTBUFFNEXTPTR9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], i32 8 -// CHECK-NEXT: [[TMP19:%.*]] = zext i44 [[TMP8]] to i64 -// CHECK-NEXT: store i64 [[TMP19]], ptr addrspace(1) [[PRINTBUFFNEXTPTR9]], align 8 +// CHECK-NEXT: [[TMP18:%.*]] = zext i55 [[LOADEDV]] to i64 +// CHECK-NEXT: store i64 [[TMP18]], ptr addrspace(1) [[PRINTBUFFNEXTPTR9]], align 8 // CHECK-NEXT: [[PRINTBUFFNEXTPTR10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR9]], i32 8 -// CHECK-NEXT: store i128 [[TMP9]], ptr addrspace(1) [[PRINTBUFFNEXTPTR10]], align 8 -// CHECK-NEXT: [[PRINTBUFFNEXTPTR11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR10]], i32 16 +// CHECK-NEXT: [[TMP19:%.*]] = zext i44 [[LOADEDV2]] to i64 +// CHECK-NEXT: store i64 [[TMP19]], ptr addrspace(1) [[PRINTBUFFNEXTPTR10]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR10]], i32 8 +// CHECK-NEXT: store i128 [[TMP9]], ptr addrspace(1) [[PRINTBUFFNEXTPTR11]], align 8 +// CHECK-NEXT: [[PRINTBUFFNEXTPTR12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR11]], i32 16 // CHECK-NEXT: br label [[END_BLOCK]] // // CHECK_CONSTRAINED-LABEL: define dso_local noundef i32 @_Z4foo3v @@ -326,8 +328,10 @@ __device__ _BitInt(128) Int128 = 45637; // CHECK_CONSTRAINED-NEXT: [[TMP4:%.*]] = load double, ptr addrspacecast (ptr addrspace(1) @f2 to ptr), align 8 // CHECK_CONSTRAINED-NEXT: [[TMP5:%.*]] = load half, ptr addrspacecast (ptr addrspace(1) @f3 to ptr), align 2 // CHECK_CONSTRAINED-NEXT: [[TMP6:%.*]] = load bfloat, ptr addrspacecast (ptr addrspace(1) @f4 to ptr), align 2 -// CHECK_CONSTRAINED-NEXT: [[TMP7:%.*]] = load i55, ptr addrspacecast (ptr addrspace(1) @Int55 to ptr), align 8 -// CHECK_CONSTRAINED-NEXT: [[TMP8:%.*]] = load i44, ptr addrspacecast (ptr addrspace(1) @Int44 to ptr), align 8 +// CHECK_CONSTRAINED-NEXT: [[TMP7:%.*]] = load i64, ptr addrspacecast (ptr addrspace(1) @Int55 to ptr), align 8 +// CHECK_CONSTRAINED-NEXT: [[LOADEDV:%.*]] = trunc i64 [[TMP7]] to i55 +// CHECK_CONSTRAINED-NEXT: [[TMP8:%.*]] = load i64, ptr addrspacecast (ptr addrspace(1) @Int44 to ptr), align 8 +// CHECK_CONSTRAINED-NEXT: [[LOADEDV2:%.*]] = trunc i64 [[TMP8]] to i44 // CHECK_CONSTRAINED-NEXT: [[TMP9:%.*]] = load i128, ptr addrspacecast (ptr addrspace(1) @Int128 to ptr), align 8 // CHECK_CONSTRAINED-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 108) // CHECK_CONSTRAINED-NEXT: [[TMP10:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null @@ -345,30 +349,30 @@ __device__ _BitInt(128) Int128 = 45637; // CHECK_CONSTRAINED-NEXT: store i64 [[TMP14]], ptr addrspace(1) [[TMP13]], align 8 // CHECK_CONSTRAINED-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i32 8 // CHECK_CONSTRAINED-NEXT: store ptr addrspacecast (ptr addrspace(3) @_ZZ4foo3vE1s to ptr), ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 8 -// CHECK_CONSTRAINED-NEXT: [[PRINTBUFFNEXTPTR2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8 +// CHECK_CONSTRAINED-NEXT: [[PRINTBUFFNEXTPTR3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8 // CHECK_CONSTRAINED-NEXT: [[TMP15:%.*]] = zext i32 [[CONV]] to i64 -// CHECK_CONSTRAINED-NEXT: store i64 [[TMP15]], ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], align 8 -// CHECK_CONSTRAINED-NEXT: [[PRINTBUFFNEXTPTR3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], i32 8 -// CHECK_CONSTRAINED-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], align 8 +// CHECK_CONSTRAINED-NEXT: store i64 [[TMP15]], ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], align 8 // CHECK_CONSTRAINED-NEXT: [[PRINTBUFFNEXTPTR4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], i32 8 -// CHECK_CONSTRAINED-NEXT: store double [[CONV1]], ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], align 8 +// CHECK_CONSTRAINED-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], align 8 // CHECK_CONSTRAINED-NEXT: [[PRINTBUFFNEXTPTR5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], i32 8 -// CHECK_CONSTRAINED-NEXT: store double [[TMP4]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 8 +// CHECK_CONSTRAINED-NEXT: store double [[CONV1]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 8 // CHECK_CONSTRAINED-NEXT: [[PRINTBUFFNEXTPTR6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], i32 8 -// CHECK_CONSTRAINED-NEXT: [[TMP16:%.*]] = fpext half [[TMP5]] to double -// CHECK_CONSTRAINED-NEXT: store double [[TMP16]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 8 +// CHECK_CONSTRAINED-NEXT: store double [[TMP4]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 8 // CHECK_CONSTRAINED-NEXT: [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 8 -// CHECK_CONSTRAINED-NEXT: [[TMP17:%.*]] = fpext bfloat [[TMP6]] to double -// CHECK_CONSTRAINED-NEXT: store double [[TMP17]], ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 8 +// CHECK_CONSTRAINED-NEXT: [[TMP16:%.*]] = fpext half [[TMP5]] to double +// CHECK_CONSTRAINED-NEXT: store double [[TMP16]], ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 8 // CHECK_CONSTRAINED-NEXT: [[PRINTBUFFNEXTPTR8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], i32 8 -// CHECK_CONSTRAINED-NEXT: [[TMP18:%.*]] = zext i55 [[TMP7]] to i64 -// CHECK_CONSTRAINED-NEXT: store i64 [[TMP18]], ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], align 8 +// CHECK_CONSTRAINED-NEXT: [[TMP17:%.*]] = fpext bfloat [[TMP6]] to double +// CHECK_CONSTRAINED-NEXT: store double [[TMP17]], ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], align 8 // CHECK_CONSTRAINED-NEXT: [[PRINTBUFFNEXTPTR9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], i32 8 -// CHECK_CONSTRAINED-NEXT: [[TMP19:%.*]] = zext i44 [[TMP8]] to i64 -// CHECK_CONSTRAINED-NEXT: store i64 [[TMP19]], ptr addrspace(1) [[PRINTBUFFNEXTPTR9]], align 8 +// CHECK_CONSTRAINED-NEXT: [[TMP18:%.*]] = zext i55 [[LOADEDV]] to i64 +// CHECK_CONSTRAINED-NEXT: store i64 [[TMP18]], ptr addrspace(1) [[PRINTBUFFNEXTPTR9]], align 8 // CHECK_CONSTRAINED-NEXT: [[PRINTBUFFNEXTPTR10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR9]], i32 8 -// CHECK_CONSTRAINED-NEXT: store i128 [[TMP9]], ptr addrspace(1) [[PRINTBUFFNEXTPTR10]], align 8 -// CHECK_CONSTRAINED-NEXT: [[PRINTBUFFNEXTPTR11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR10]], i32 16 +// CHECK_CONSTRAINED-NEXT: [[TMP19:%.*]] = zext i44 [[LOADEDV2]] to i64 +// CHECK_CONSTRAINED-NEXT: store i64 [[TMP19]], ptr addrspace(1) [[PRINTBUFFNEXTPTR10]], align 8 +// CHECK_CONSTRAINED-NEXT: [[PRINTBUFFNEXTPTR11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR10]], i32 8 +// CHECK_CONSTRAINED-NEXT: store i128 [[TMP9]], ptr addrspace(1) [[PRINTBUFFNEXTPTR11]], align 8 +// CHECK_CONSTRAINED-NEXT: [[PRINTBUFFNEXTPTR12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR11]], i32 16 // CHECK_CONSTRAINED-NEXT: br label [[END_BLOCK]] // __device__ int foo3() { diff --git a/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl index 9881dabc3a1106..b0b95074c972d5 100644 --- a/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s // CHECK-LABEL: builtin_bool_to_float_type_promotion -// CHECK: %conv1 = uitofp i1 %tobool to double +// CHECK: %conv1 = uitofp i1 %loadedv to double // CHECK: %dx.dot = fmul double %conv, %conv1 // CHECK: %conv2 = fptrunc double %dx.dot to float // CHECK: ret float %conv2 @@ -10,7 +10,7 @@ float builtin_bool_to_float_type_promotion ( float p0, bool p1 ) { } // CHECK-LABEL: builtin_bool_to_float_arg1_type_promotion -// CHECK: %conv = uitofp i1 %tobool to double +// CHECK: %conv = uitofp i1 %loadedv to double // CHECK: %conv1 = fpext float %1 to double // CHECK: %dx.dot = fmul double %conv, %conv1 // CHECK: %conv2 = fptrunc double %dx.dot to float diff --git a/clang/test/CodeGenObjCXX/lambda-to-block.mm b/clang/test/CodeGenObjCXX/lambda-to-block.mm index e3ce7104d97bf0..86b540ed52e9ea 100644 --- a/clang/test/CodeGenObjCXX/lambda-to-block.mm +++ b/clang/test/CodeGenObjCXX/lambda-to-block.mm @@ -2,11 +2,10 @@ // Shouldn't crash! -// CHECK: %[[CLASS_ANON:.*]] = type { %[[STRUCT_COPYABLE:.*]] } -// CHECK: %[[STRUCT_COPYABLE]] = type { i8 } -// CHECK: %[[CLASS_ANON_0:.*]] = type { %[[STRUCT_COPYABLE]] } -// CHECK: %[[CLASS_ANON_1:.*]] = type { %[[STRUCT_COPYABLE]] } -// CHECK: %[[CLASS_ANON_2:.*]] = type { %[[STRUCT_COPYABLE]] } +// CHECK: %[[CLASS_ANON:.*]] = type { i8 } +// CHECK: %[[CLASS_ANON_0:.*]] = type { i8 } +// CHECK: %[[CLASS_ANON_1:.*]] = type { i8 } +// CHECK: %[[CLASS_ANON_2:.*]] = type { i8 } // CHECK: @[[BLOCK_DESC0:.*]] = internal constant { i64, i64, ptr, ptr, ptr, ptr } { i64 0, i64 33, ptr @[[COPY_HELPER0:.*__copy_helper_block_.*]], ptr @__destroy_helper_block{{.*}}, {{.*}}}, align 8 // CHECK: @[[BLOCK_DESC1:.*]] = internal constant { i64, i64, ptr, ptr, ptr, ptr } { i64 0, i64 33, ptr @[[COPY_HELPER1:.*__copy_helper_block_.*]], ptr @__destroy_helper_block{{.*}}, {{.*}}}, align 8 diff --git a/clang/test/Driver/msse2avx.c b/clang/test/Driver/msse2avx.c new file mode 100644 index 00000000000000..a63ac9a6c86681 --- /dev/null +++ b/clang/test/Driver/msse2avx.c @@ -0,0 +1,7 @@ +// RUN: %clang -### -c -target x86_64 -march=x86-64 -Xassembler -msse2avx %s 2>&1 | FileCheck %s +// RUN: %clang -### -c -target x86_64 -march=x86-64 -x assembler -Xassembler -msse2avx %s 2>&1 | FileCheck %s + +// CHECK: "-msse2avx" + +// RUN: not %clang -### -c -target aarch64 -march=armv8a -msse2avx %s 2>&1 | FileCheck --check-prefix=ERR %s +// ERR: error: unsupported option '-msse2avx' for target 'aarch64' diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index 875472202d2429..88cbcc1296244f 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -169,8 +169,8 @@ // CHECK-NEXT: xwchc 2.2 'Xwchc' (WCH/QingKe additional compressed opcodes) // CHECK-EMPTY: // CHECK-NEXT: Experimental extensions -// CHECK-NEXT: zicfilp 0.4 'Zicfilp' (Landing pad) -// CHECK-NEXT: zicfiss 0.4 'Zicfiss' (Shadow stack) +// CHECK-NEXT: zicfilp 1.0 'Zicfilp' (Landing pad) +// CHECK-NEXT: zicfiss 1.0 'Zicfiss' (Shadow stack) // CHECK-NEXT: zalasr 0.1 'Zalasr' (Load-Acquire and Store-Release Instructions) // CHECK-NEXT: smmpm 1.0 'Smmpm' (Machine-level Pointer Masking for M-mode) // CHECK-NEXT: smnpm 1.0 'Smnpm' (Machine-level Pointer Masking for next lower privilege mode) diff --git a/clang/test/Driver/ps4-linker.c b/clang/test/Driver/ps4-linker.c new file mode 100644 index 00000000000000..be0103bffe8136 --- /dev/null +++ b/clang/test/Driver/ps4-linker.c @@ -0,0 +1,20 @@ +// Test the driver's control over the JustMyCode behavior with linker flags. + +// RUN: %clang --target=x86_64-scei-ps4 -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-LIB %s +// RUN: %clang --target=x86_64-scei-ps4 -flto=thin -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-THIN-LTO,CHECK-LIB %s +// RUN: %clang --target=x86_64-scei-ps4 -flto=full -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-FULL-LTO,CHECK-LIB %s + +// CHECK-NOT: -enable-jmc-instrument +// CHECK-THIN-LTO: "-lto-thin-debug-options= -generate-arange-section -enable-jmc-instrument" +// CHECK-FULL-LTO: "-lto-debug-options= -generate-arange-section -enable-jmc-instrument" + +// Check the default library name. +// CHECK-LIB: "--whole-archive" "-lSceDbgJmc" "--no-whole-archive" + +// Test the driver's control over the -fcrash-diagnostics-dir behavior with linker flags. + +// RUN: %clang --target=x86_64-scei-ps4 -flto=thin -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-THIN-LTO %s +// RUN: %clang --target=x86_64-scei-ps4 -flto=full -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-FULL-LTO %s + +// CHECK-DIAG-THIN-LTO: "-lto-thin-debug-options= -generate-arange-section -crash-diagnostics-dir=mydumps" +// CHECK-DIAG-FULL-LTO: "-lto-debug-options= -generate-arange-section -crash-diagnostics-dir=mydumps" diff --git a/clang/test/Driver/ps4-ps5-linker.c b/clang/test/Driver/ps4-ps5-linker.c deleted file mode 100644 index 8aae94c8388346..00000000000000 --- a/clang/test/Driver/ps4-ps5-linker.c +++ /dev/null @@ -1,29 +0,0 @@ -// Test the driver's control over the JustMyCode behavior with linker flags. - -// RUN: %clang --target=x86_64-scei-ps4 -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-PS4,CHECK-PS4-LIB %s -// RUN: %clang --target=x86_64-scei-ps4 -flto=thin -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-PS4-THIN-LTO,CHECK-PS4-LIB %s -// RUN: %clang --target=x86_64-scei-ps4 -flto=full -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-PS4-FULL-LTO,CHECK-PS4-LIB %s -// RUN: %clang --target=x86_64-scei-ps5 -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-PS5,CHECK-PS5-LIB %s -// RUN: %clang --target=x86_64-scei-ps5 -flto -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-PS5-LTO,CHECK-PS5-LIB %s - -// CHECK-PS4-NOT: -enable-jmc-instrument -// CHECK-PS4-THIN-LTO: "-lto-thin-debug-options= -generate-arange-section -enable-jmc-instrument" -// CHECK-PS4-FULL-LTO: "-lto-debug-options= -generate-arange-section -enable-jmc-instrument" -// CHECK-PS5-NOT: -plugin-opt=-enable-jmc-instrument -// CHECK-PS5-LTO: -plugin-opt=-enable-jmc-instrument - -// Check the default library name. -// CHECK-PS4-LIB: "--whole-archive" "-lSceDbgJmc" "--no-whole-archive" -// CHECK-PS5-LIB: "--whole-archive" "-lSceJmc_nosubmission" "--no-whole-archive" - -// Test the driver's control over the -fcrash-diagnostics-dir behavior with linker flags. - -// RUN: %clang --target=x86_64-scei-ps4 -flto=thin -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-PS4-THIN-LTO %s -// RUN: %clang --target=x86_64-scei-ps4 -flto=full -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-PS4-FULL-LTO %s -// RUN: %clang --target=x86_64-scei-ps5 -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-PS5 %s -// RUN: %clang --target=x86_64-scei-ps5 -flto -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-PS5-LTO %s - -// CHECK-DIAG-PS4-THIN-LTO: "-lto-thin-debug-options= -generate-arange-section -crash-diagnostics-dir=mydumps" -// CHECK-DIAG-PS4-FULL-LTO: "-lto-debug-options= -generate-arange-section -crash-diagnostics-dir=mydumps" -// CHECK-DIAG-PS5-NOT: -plugin-opt=-crash-diagnostics-dir=mydumps -// CHECK-DIAG-PS5-LTO: -plugin-opt=-crash-diagnostics-dir=mydumps diff --git a/clang/test/Driver/ps5-linker.c b/clang/test/Driver/ps5-linker.c new file mode 100644 index 00000000000000..9f1e3a273b2db2 --- /dev/null +++ b/clang/test/Driver/ps5-linker.c @@ -0,0 +1,18 @@ +// Test the driver's control over the JustMyCode behavior with linker flags. + +// RUN: %clang --target=x86_64-scei-ps5 -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-LIB %s +// RUN: %clang --target=x86_64-scei-ps5 -flto -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LTO,CHECK-LIB %s + +// CHECK-NOT: -plugin-opt=-enable-jmc-instrument +// CHECK-LTO: -plugin-opt=-enable-jmc-instrument + +// Check the default library name. +// CHECK-LIB: "--whole-archive" "-lSceJmc_nosubmission" "--no-whole-archive" + +// Test the driver's control over the -fcrash-diagnostics-dir behavior with linker flags. + +// RUN: %clang --target=x86_64-scei-ps5 -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG %s +// RUN: %clang --target=x86_64-scei-ps5 -flto -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-LTO %s + +// CHECK-DIAG-NOT: -plugin-opt=-crash-diagnostics-dir=mydumps +// CHECK-DIAG-LTO: -plugin-opt=-crash-diagnostics-dir=mydumps diff --git a/clang/test/Driver/riscv-features.c b/clang/test/Driver/riscv-features.c index cfe293cd4667ff..b4fad5177c5f76 100644 --- a/clang/test/Driver/riscv-features.c +++ b/clang/test/Driver/riscv-features.c @@ -1,8 +1,8 @@ // RUN: %clang --target=riscv32-unknown-elf -### %s -fsyntax-only 2>&1 | FileCheck %s // RUN: %clang --target=riscv64-unknown-elf -### %s -fsyntax-only 2>&1 | FileCheck %s -// RUN: %clang --target=riscv64-linux-android -### %s -fsyntax-only 2>&1 | FileCheck %s -check-prefixes=ANDROID,DEFAULT,FAST-UNALIGNED-ACCESS -// RUN: %clang -mabi=lp64d --target=riscv64-linux-android -### %s -fsyntax-only 2>&1 | FileCheck %s -check-prefixes=ANDROID,DEFAULT,FAST-UNALIGNED-ACCESS -// RUN: %clang -mabi=lp64d --target=riscv64-linux-android -mstrict-align -### %s -fsyntax-only 2>&1 | FileCheck %s -check-prefixes=NO-FAST-UNALIGNED-ACCESS +// RUN: %clang --target=riscv64-linux-android -### %s -fsyntax-only 2>&1 | FileCheck %s -check-prefixes=ANDROID,DEFAULT,FAST-SCALAR-UNALIGNED-ACCESS,FAST-VECTOR-UNALIGNED-ACCESS +// RUN: %clang -mabi=lp64d --target=riscv64-linux-android -### %s -fsyntax-only 2>&1 | FileCheck %s -check-prefixes=ANDROID,DEFAULT,FAST-SCALAR-UNALIGNED-ACCESS,FAST-VECTOR-UNALIGNED-ACCESS +// RUN: %clang -mabi=lp64d --target=riscv64-linux-android -mstrict-align -mvector-strict-align -### %s -fsyntax-only 2>&1 | FileCheck %s -check-prefixes=NO-FAST-SCALAR-UNALIGNED-ACCESS,NO-FAST-VECTOR-UNALIGNED-ACCESS // CHECK: fno-signed-char @@ -35,13 +35,23 @@ // NO-FORCE-SW-SCS: "-target-feature" "-forced-sw-shadow-stack" // DEFAULT-NOT: "-target-feature" "+forced-sw-shadow-stack" -// RUN: %clang --target=riscv32-unknown-elf -### %s -mno-strict-align 2>&1 | FileCheck %s -check-prefix=FAST-UNALIGNED-ACCESS -// RUN: %clang --target=riscv32-unknown-elf -### %s -mstrict-align 2>&1 | FileCheck %s -check-prefix=NO-FAST-UNALIGNED-ACCESS +// RUN: %clang --target=riscv32-unknown-elf -### %s -mno-strict-align 2>&1 | FileCheck %s -check-prefixes=FAST-SCALAR-UNALIGNED-ACCESS,FAST-VECTOR-UNALIGNED-ACCESS +// RUN: %clang --target=riscv32-unknown-elf -### %s -mstrict-align 2>&1 | FileCheck %s -check-prefixes=NO-FAST-SCALAR-UNALIGNED-ACCESS,NO-FAST-VECTOR-UNALIGNED-ACCESS +// RUN: %clang --target=riscv32-unknown-elf -### %s -mno-scalar-strict-align 2>&1 | FileCheck %s -check-prefix=FAST-SCALAR-UNALIGNED-ACCESS +// RUN: %clang --target=riscv32-unknown-elf -### %s -mscalar-strict-align 2>&1 | FileCheck %s -check-prefix=NO-FAST-SCALAR-UNALIGNED-ACCESS +// RUN: %clang --target=riscv32-unknown-elf -### %s -mno-scalar-strict-align -mstrict-align 2>&1 | FileCheck %s -check-prefixes=NO-FAST-SCALAR-UNALIGNED-ACCESS,NO-FAST-VECTOR-UNALIGNED-ACCESS // RUN: touch %t.o // RUN: %clang --target=riscv32-unknown-elf -### %t.o -mno-strict-align -mstrict-align -// FAST-UNALIGNED-ACCESS: "-target-feature" "+unaligned-scalar-mem" "-target-feature" "+unaligned-vector-mem" -// NO-FAST-UNALIGNED-ACCESS: "-target-feature" "-unaligned-scalar-mem" "-target-feature" "-unaligned-vector-mem" +// FAST-SCALAR-UNALIGNED-ACCESS: "-target-feature" "+unaligned-scalar-mem" +// NO-FAST-SCALAR-UNALIGNED-ACCESS: "-target-feature" "-unaligned-scalar-mem" + +// RUN: %clang --target=riscv32-unknown-elf -### %s -mno-vector-strict-align 2>&1 | FileCheck %s -check-prefix=FAST-VECTOR-UNALIGNED-ACCESS +// RUN: %clang --target=riscv32-unknown-elf -### %s -mvector-strict-align 2>&1 | FileCheck %s -check-prefix=NO-FAST-VECTOR-UNALIGNED-ACCESS +// RUN: %clang --target=riscv32-unknown-elf -### %s -mno-vector-strict-align -mstrict-align 2>&1 | FileCheck %s -check-prefix=NO-FAST-VECTOR-UNALIGNED-ACCESS +// RUN: %clang --target=riscv32-unknown-elf -### %s -mno-strict-align -mvector-strict-align 2>&1 | FileCheck %s -check-prefix=NO-FAST-VECTOR-UNALIGNED-ACCESS +// FAST-VECTOR-UNALIGNED-ACCESS: "-target-feature" "+unaligned-vector-mem" +// NO-FAST-VECTOR-UNALIGNED-ACCESS: "-target-feature" "-unaligned-vector-mem" // RUN: %clang --target=riscv32-unknown-elf -### %s 2>&1 | FileCheck %s -check-prefix=NOUWTABLE // RUN: %clang --target=riscv32-unknown-elf -fasynchronous-unwind-tables -### %s 2>&1 | FileCheck %s -check-prefix=UWTABLE diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c index 6773571556bd42..ba61457102a57d 100644 --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -8,8 +8,13 @@ // RUN: %clang --target=i386 -march=i386 -mmmx -m3dnow -m3dnowa %s -### 2>&1 | FileCheck -check-prefix=MMX %s // RUN: %clang --target=i386 -march=i386 -mno-mmx -mno-3dnow -mno-3dnowa %s -### 2>&1 | FileCheck -check-prefix=NO-MMX %s -// MMX: "-target-feature" "+mmx" "-target-feature" "+3dnow" "-target-feature" "+3dnowa" -// NO-MMX: "-target-feature" "-mmx" "-target-feature" "-3dnow" "-target-feature" "-3dnowa" +// MMX: warning: the clang compiler does not support '-m3dnowa' +// MMX: warning: the clang compiler does not support '-m3dnow' +// MMX-NOT: "3dnow" +// MMX: "-target-feature" "+mmx" +// MMX-NOT: "3dnow" +// NO-MMX-NOT: warning +// NO-MMX: "-target-feature" "-mmx" // RUN: %clang --target=i386 -march=i386 -msse -msse2 -msse3 -mssse3 -msse4a -msse4.1 -msse4.2 %s -### 2>&1 | FileCheck -check-prefix=SSE %s // RUN: %clang --target=i386 -march=i386 -mno-sse -mno-sse2 -mno-sse3 -mno-ssse3 -mno-sse4a -mno-sse4.1 -mno-sse4.2 %s -### 2>&1 | FileCheck -check-prefix=NO-SSE %s diff --git a/clang/test/FixIt/fixit.cpp b/clang/test/FixIt/fixit.cpp index 144eefb3ae4bd0..605c2d0bd02355 100644 --- a/clang/test/FixIt/fixit.cpp +++ b/clang/test/FixIt/fixit.cpp @@ -158,12 +158,12 @@ class F1 { template class F2 { - typename F1:: /*template*/ Iterator<0> Mypos; // expected-warning {{use 'template' keyword to treat 'Iterator' as a dependent template name}} + typename F1:: /*template*/ Iterator<0> Mypos; // expected-error {{use 'template' keyword to treat 'Iterator' as a dependent template name}} }; template void f(){ - typename F1:: /*template*/ Iterator<0> Mypos; // expected-warning {{use 'template' keyword to treat 'Iterator' as a dependent template name}} + typename F1:: /*template*/ Iterator<0> Mypos; // expected-error {{use 'template' keyword to treat 'Iterator' as a dependent template name}} } // Tests for &/* fixits diff --git a/clang/test/Frontend/fixed_point_comparisons.c b/clang/test/Frontend/fixed_point_comparisons.c index 8cd2aa2dbc651f..59c4405e41c031 100644 --- a/clang/test/Frontend/fixed_point_comparisons.c +++ b/clang/test/Frontend/fixed_point_comparisons.c @@ -249,8 +249,8 @@ void TestIntComparisons(void) { sa == b; // CHECK: [[A:%[0-9]+]] = load i16, ptr %sa, align 2 // CHECK-NEXT: [[B:%[0-9]+]] = load i8, ptr %b, align 1 - // CHECK-NEXT: %tobool = trunc i8 [[B]] to i1 - // CHECK-NEXT: [[CONV_B:%[a-z0-9]+]] = zext i1 %tobool to i32 + // CHECK-NEXT: %loadedv = trunc i8 [[B]] to i1 + // CHECK-NEXT: [[CONV_B:%[a-z0-9]+]] = zext i1 %loadedv to i32 // CHECK-NEXT: [[RESIZE_A:%[a-z0-9]+]] = sext i16 [[A]] to i39 // CHECK-NEXT: [[RESIZE_B:%[a-z0-9]+]] = sext i32 [[CONV_B]] to i39 // CHECK-NEXT: [[UPSCALE_B:%[a-z0-9]+]] = shl i39 [[RESIZE_B]], 7 diff --git a/clang/test/Headers/mm3dnow.c b/clang/test/Headers/mm3dnow.c index 255483cb9b8364..a9b6dd88f8034a 100644 --- a/clang/test/Headers/mm3dnow.c +++ b/clang/test/Headers/mm3dnow.c @@ -1,16 +1,21 @@ // RUN: %clang_cc1 -fsyntax-only -ffreestanding %s -verify +// RUN: %clang_cc1 -fsyntax-only -D_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS -ffreestanding %s -verify // RUN: %clang_cc1 -fsyntax-only -ffreestanding -x c++ %s -verify -// expected-no-diagnostics #if defined(i386) || defined(__x86_64__) +#ifndef _CLANG_DISABLE_CRT_DEPRECATION_WARNINGS +// expected-warning@mm3dnow.h:*{{The header is deprecated}} +#else +// expected-no-diagnostics +#endif + #include -int __attribute__((__target__(("3dnow")))) foo(int a) { - _m_femms(); +int foo(void *x) { + _m_prefetch(x); + _m_prefetchw(x); return 4; } - -__m64 __attribute__((__target__(("3dnowa")))) bar(__m64 a) { - return _m_pf2iw(a); -} +#else +// expected-no-diagnostics #endif diff --git a/clang/test/Index/binop.cpp b/clang/test/Index/binop.cpp new file mode 100644 index 00000000000000..576fd73cc2abfe --- /dev/null +++ b/clang/test/Index/binop.cpp @@ -0,0 +1,92 @@ +// RUN: c-index-test -test-print-binops %s | FileCheck %s + +struct C { + int m; +}; + +void func(void) { +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-value" + int a, b; + int C::*p = &C::m; + + C c; + c.*p; + + C *pc; + pc->*p; + + a *b; + a / b; + a % b; + a + b; + a - b; + + a << b; + a >> b; + + a < b; + a > b; + + a <= b; + a >= b; + a == b; + a != b; + + a &b; + a ^ b; + a | b; + + a &&b; + a || b; + + a = b; + + a *= b; + a /= b; + a %= b; + a += b; + a -= b; + + a <<= b; + a >>= b; + + a &= b; + a ^= b; + a |= b; + a, b; +#pragma clang diagnostic pop +} + +// CHECK: BinaryOperator=.* BinOp=.* 1 +// CHECK: BinaryOperator=->* BinOp=->* 2 +// CHECK: BinaryOperator=* BinOp=* 3 +// CHECK: BinaryOperator=/ BinOp=/ 4 +// CHECK: BinaryOperator=% BinOp=% 5 +// CHECK: BinaryOperator=+ BinOp=+ 6 +// CHECK: BinaryOperator=- BinOp=- 7 +// CHECK: BinaryOperator=<< BinOp=<< 8 +// CHECK: BinaryOperator=>> BinOp=>> 9 +// CHECK: BinaryOperator=< BinOp=< 11 +// CHECK: BinaryOperator=> BinOp=> 12 +// CHECK: BinaryOperator=<= BinOp=<= 13 +// CHECK: BinaryOperator=>= BinOp=>= 14 +// CHECK: BinaryOperator=== BinOp=== 15 +// CHECK: BinaryOperator=!= BinOp=!= 16 +// CHECK: BinaryOperator=& BinOp=& 17 +// CHECK: BinaryOperator=^ BinOp=^ 18 +// CHECK: BinaryOperator=| BinOp=| 19 +// CHECK: BinaryOperator=&& BinOp=&& 20 +// CHECK: BinaryOperator=|| BinOp=|| 21 +// CHECK: BinaryOperator== BinOp== 22 +// CHECK: CompoundAssignOperator=*= BinOp=*= 23 +// CHECK: CompoundAssignOperator=/= BinOp=/= 24 +// CHECK: CompoundAssignOperator=%= BinOp=%= 25 +// CHECK: CompoundAssignOperator=+= BinOp=+= 26 +// CHECK: CompoundAssignOperator=-= BinOp=-= 27 +// CHECK: CompoundAssignOperator=<<= BinOp=<<= 28 +// CHECK: CompoundAssignOperator=>>= BinOp=>>= 29 +// CHECK: CompoundAssignOperator=&= BinOp=&= 30 +// CHECK: CompoundAssignOperator=^= BinOp=^= 31 +// CHECK: CompoundAssignOperator=|= BinOp=|= 32 +// CHECK: BinaryOperator=, BinOp=, 33 diff --git a/clang/test/Index/blocks.c b/clang/test/Index/blocks.c index 3f33e48e4ced03..304c7800cb700c 100644 --- a/clang/test/Index/blocks.c +++ b/clang/test/Index/blocks.c @@ -23,7 +23,7 @@ void test() { // CHECK: blocks.c:9:18: TypeRef=struct foo:4:8 Extent=[9:18 - 9:21] // CHECK: blocks.c:9:28: CompoundStmt= Extent=[9:28 - 9:58] // CHECK: blocks.c:9:30: ReturnStmt= Extent=[9:30 - 9:55] -// CHECK: blocks.c:9:37: BinaryOperator= Extent=[9:37 - 9:55] +// CHECK: blocks.c:9:37: BinaryOperator=+ Extent=[9:37 - 9:55] // CHECK: blocks.c:9:37: CStyleCastExpr= Extent=[9:37 - 9:51] // CHECK: blocks.c:9:38: TypeRef=int_t:3:13 Extent=[9:38 - 9:43] // CHECK: blocks.c:9:50: MemberRefExpr=x:4:19 SingleRefName=[9:50 - 9:51] RefName=[9:50 - 9:51] Extent=[9:45 - 9:51] @@ -31,4 +31,3 @@ void test() { // CHECK: blocks.c:9:54: DeclRefExpr=i:8:11 Extent=[9:54 - 9:55] // CHECK: blocks.c:9:59: UnaryOperator= Extent=[9:59 - 9:64] // CHECK: blocks.c:9:60: DeclRefExpr=_foo:7:21 Extent=[9:60 - 9:64] - diff --git a/clang/test/Index/c-index-api-loadTU-test.m b/clang/test/Index/c-index-api-loadTU-test.m index 7aa8f800e037cf..7ec57cf3ab63d8 100644 --- a/clang/test/Index/c-index-api-loadTU-test.m +++ b/clang/test/Index/c-index-api-loadTU-test.m @@ -130,7 +130,7 @@ @interface TestAttributes() // CHECK: c-index-api-loadTU-test.m:50:13: VarDecl=d:50:13 (Definition) Extent=[50:2 - 50:14] // CHECK: c-index-api-loadTU-test.m:50:2: TypeRef=id:0:0 Extent=[50:2 - 50:4] // CHECK: c-index-api-loadTU-test.m:50:6: ObjCProtocolRef=Proto:25:11 Extent=[50:6 - 50:11] -// CHECK: c-index-api-loadTU-test.m:51:2: BinaryOperator= Extent=[51:2 - 51:7] +// CHECK: c-index-api-loadTU-test.m:51:2: BinaryOperator== Extent=[51:2 - 51:7] // CHECK: c-index-api-loadTU-test.m:51:2: DeclRefExpr=d:50:13 Extent=[51:2 - 51:3] // CHECK: c-index-api-loadTU-test.m:51:6: UnexposedExpr=c:49:12 Extent=[51:6 - 51:7] // CHECK: c-index-api-loadTU-test.m:51:6: UnexposedExpr=c:49:12 Extent=[51:6 - 51:7] diff --git a/clang/test/Index/index-concepts.cpp b/clang/test/Index/index-concepts.cpp index d29b9dcf79522b..17fbd02f34fe8f 100644 --- a/clang/test/Index/index-concepts.cpp +++ b/clang/test/Index/index-concepts.cpp @@ -41,10 +41,10 @@ template concept ConWithLogicalAnd = Con1 && sizeof(T) > sizeFunc(); // CHECK: index-concepts.cpp:[[@LINE-1]]:9: ConceptDecl=ConWithLogicalAnd:[[@LINE-1]]:9 (Definition) Extent=[[[@LINE-2]]:1 - [[@LINE-1]]:62] // CHECK: index-concepts.cpp:[[@LINE-3]]:17: TemplateTypeParameter=T:[[@LINE-3]]:17 (Definition) Extent=[[[@LINE-3]]:11 - [[@LINE-3]]:18] [access=public] -// CHECK: index-concepts.cpp:[[@LINE-3]]:29: BinaryOperator= Extent=[[[@LINE-3]]:29 - [[@LINE-3]]:62] +// CHECK: index-concepts.cpp:[[@LINE-3]]:29: BinaryOperator=&& Extent=[[[@LINE-3]]:29 - [[@LINE-3]]:62] // CHECK: index-concepts.cpp:[[@LINE-4]]:29: ConceptSpecializationExpr= Extent=[[[@LINE-4]]:29 - [[@LINE-4]]:36] // CHECK: index-concepts.cpp:[[@LINE-5]]:29: TemplateRef=Con1:31:9 Extent=[[[@LINE-5]]:29 - [[@LINE-5]]:33] -// CHECK: index-concepts.cpp:[[@LINE-6]]:40: BinaryOperator= Extent=[[[@LINE-6]]:40 - [[@LINE-6]]:62] +// CHECK: index-concepts.cpp:[[@LINE-6]]:40: BinaryOperator=> Extent=[[[@LINE-6]]:40 - [[@LINE-6]]:62] // CHECK: index-concepts.cpp:[[@LINE-7]]:40: UnaryExpr= Extent=[[[@LINE-7]]:40 - [[@LINE-7]]:49] // CHECK: index-concepts.cpp:[[@LINE-8]]:47: TypeRef=T:40:17 Extent=[[[@LINE-8]]:47 - [[@LINE-8]]:48] // CHECK: index-concepts.cpp:[[@LINE-9]]:52: UnexposedExpr=sizeFunc:38:15 Extent=[[[@LINE-9]]:52 - [[@LINE-9]]:62] @@ -64,7 +64,7 @@ concept ConTwoTemplateParams = ns::ConInNamespace && ConWithLogicalAnd; // CHECK: index-concepts.cpp:[[@LINE-1]]:9: ConceptDecl=ConTwoTemplateParams:[[@LINE-1]]:9 (Definition) Extent=[[[@LINE-2]]:1 - [[@LINE-1]]:79] // CHECK: index-concepts.cpp:[[@LINE-3]]:17: TemplateTypeParameter=T1:[[@LINE-3]]:17 (Definition) Extent=[[[@LINE-3]]:11 - [[@LINE-3]]:19] [access=public] // CHECK: index-concepts.cpp:[[@LINE-4]]:27: TemplateTypeParameter=T2:[[@LINE-4]]:27 (Definition) Extent=[[[@LINE-4]]:21 - [[@LINE-4]]:29] [access=public] -// CHECK: index-concepts.cpp:[[@LINE-4]]:32: BinaryOperator= Extent=[[[@LINE-4]]:32 - [[@LINE-4]]:79] +// CHECK: index-concepts.cpp:[[@LINE-4]]:32: BinaryOperator=&& Extent=[[[@LINE-4]]:32 - [[@LINE-4]]:79] // CHECK: index-concepts.cpp:[[@LINE-5]]:32: ConceptSpecializationExpr= Extent=[[[@LINE-5]]:32 - [[@LINE-5]]:54] // CHECK: index-concepts.cpp:[[@LINE-6]]:32: NamespaceRef=ns:55:11 Extent=[[[@LINE-6]]:32 - [[@LINE-6]]:34] // CHECK: index-concepts.cpp:[[@LINE-7]]:36: TemplateRef=ConInNamespace:58:9 Extent=[[[@LINE-7]]:36 - [[@LINE-7]]:50] diff --git a/clang/test/Index/load-staticassert.cpp b/clang/test/Index/load-staticassert.cpp index 04e45c2d747146..99f59885eaed52 100644 --- a/clang/test/Index/load-staticassert.cpp +++ b/clang/test/Index/load-staticassert.cpp @@ -3,8 +3,8 @@ static_assert(2 + 2 == 4, "Simple maths"); // RUN: c-index-test -test-load-source all -fno-delayed-template-parsing -std=c++11 %s | FileCheck %s // CHECK: load-staticassert.cpp:2:1: StaticAssert=:2:1 (Definition) Extent=[2:1 - 2:42] -// CHECK: load-staticassert.cpp:2:15: BinaryOperator= Extent=[2:15 - 2:25] -// CHECK: load-staticassert.cpp:2:15: BinaryOperator= Extent=[2:15 - 2:20] +// CHECK: load-staticassert.cpp:2:15: BinaryOperator=== Extent=[2:15 - 2:25] +// CHECK: load-staticassert.cpp:2:15: BinaryOperator=+ Extent=[2:15 - 2:20] // CHECK: load-staticassert.cpp:2:15: IntegerLiteral= Extent=[2:15 - 2:16] // CHECK: load-staticassert.cpp:2:19: IntegerLiteral= Extent=[2:19 - 2:20] // CHECK: load-staticassert.cpp:2:24: IntegerLiteral= Extent=[2:24 - 2:25] diff --git a/clang/test/Index/nested-binaryoperators.cpp b/clang/test/Index/nested-binaryoperators.cpp index 57adc6b54664af..443e565744a496 100644 --- a/clang/test/Index/nested-binaryoperators.cpp +++ b/clang/test/Index/nested-binaryoperators.cpp @@ -169,1815 +169,2328 @@ int foo(uint c) { // CHECK: 3:3: ReturnStmt= Extent=[3:3 - 160:52] // CHECK: 3:10: UnexposedExpr= Extent=[3:10 - 160:52] // CHECK: 3:10: ParenExpr= Extent=[3:10 - 160:52] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 160:51] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 160:19] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 159:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 158:51] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 158:19] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 157:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 156:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 155:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 154:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 153:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 152:51] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 152:19] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 151:51] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 151:19] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 150:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 149:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 148:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 147:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 146:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 145:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 144:51] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 144:19] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 143:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 142:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 141:81] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 141:49] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 141:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 141:19] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 140:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 139:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 138:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 137:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 136:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 135:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 134:81] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 134:49] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 134:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 134:19] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 133:51] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 133:19] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 132:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 131:33] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 130:64] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 130:49] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 130:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 130:19] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 129:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 128:33] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 127:64] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 127:49] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 127:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 127:19] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 126:51] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 126:19] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 125:63] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 125:31] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 125:16] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 124:64] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 124:49] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 124:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 124:19] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 123:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 122:51] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 122:19] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 121:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 120:51] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 120:19] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 119:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 118:36] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 117:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 116:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 115:48] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 115:18] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 114:48] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 114:18] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 113:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 112:62] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 112:32] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 112:18] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 111:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 110:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 109:62] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 109:32] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 109:18] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 108:48] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 108:18] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 107:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 106:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 105:48] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 105:18] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 104:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 103:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 102:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 101:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 100:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 99:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 98:48] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 98:18] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 97:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 96:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 95:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 94:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 93:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 92:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 91:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 90:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 89:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 88:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 87:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 86:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 85:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 84:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 83:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 82:48] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 82:18] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 81:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 80:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 79:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 78:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 77:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 76:48] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 76:18] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 75:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 74:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 73:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 72:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 71:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 70:62] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 70:32] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 70:18] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 69:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 68:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 67:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 66:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 65:48] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 65:18] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 64:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 63:48] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 63:18] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 62:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 61:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 60:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 59:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 58:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 57:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 56:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 55:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 54:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 53:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 52:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 51:48] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 51:18] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 50:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 49:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 48:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 47:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 46:48] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 46:18] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 45:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 44:48] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 44:18] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 43:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 42:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 41:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 40:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 39:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 38:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 37:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 36:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 35:48] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 35:18] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 34:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 33:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 32:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 31:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 30:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 29:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 28:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 27:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 26:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 25:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 24:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 23:45] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 23:15] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 22:46] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 22:32] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 22:18] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 21:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 20:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 19:48] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 19:18] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 18:48] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 18:18] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 17:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 16:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 15:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 14:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 13:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 12:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 11:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 10:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 9:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 8:34] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 7:32] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 6:32] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 5:32] -// CHECK: 3:11: BinaryOperator= Extent=[3:11 - 4:32] -// CHECK: 3:12: BinaryOperator= Extent=[3:12 - 3:34] -// CHECK: 3:12: BinaryOperator= Extent=[3:12 - 3:21] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 160:51] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 160:19] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 159:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 158:51] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 158:19] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 157:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 156:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 155:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 154:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 153:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 152:51] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 152:19] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 151:51] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 151:19] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 150:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 149:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 148:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 147:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 146:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 145:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 144:51] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 144:19] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 143:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 142:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 141:81] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 141:49] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 141:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 141:19] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 140:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 139:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 138:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 137:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 136:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 135:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 134:81] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 134:49] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 134:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 134:19] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 133:51] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 133:19] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 132:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 131:33] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 130:64] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 130:49] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 130:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 130:19] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 129:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 128:33] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 127:64] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 127:49] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 127:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 127:19] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 126:51] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 126:19] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 125:63] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 125:31] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 125:16] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 124:64] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 124:49] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 124:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 124:19] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 123:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 122:51] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 122:19] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 121:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 120:51] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 120:19] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 119:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 118:36] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 117:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 116:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 115:48] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 115:18] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 114:48] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 114:18] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 113:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 112:62] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 112:32] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 112:18] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 111:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 110:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 109:62] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 109:32] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 109:18] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 108:48] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 108:18] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 107:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 106:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 105:48] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 105:18] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 104:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 103:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 102:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 101:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 100:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 99:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 98:48] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 98:18] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 97:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 96:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 95:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 94:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 93:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 92:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 91:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 90:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 89:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 88:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 87:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 86:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 85:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 84:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 83:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 82:48] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 82:18] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 81:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 80:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 79:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 78:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 77:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 76:48] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 76:18] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 75:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 74:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 73:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 72:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 71:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 70:62] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 70:32] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 70:18] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 69:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 68:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 67:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 66:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 65:48] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 65:18] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 64:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 63:48] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 63:18] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 62:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 61:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 60:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 59:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 58:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 57:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 56:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 55:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 54:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 53:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 52:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 51:48] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 51:18] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 50:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 49:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 48:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 47:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 46:48] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 46:18] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 45:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 44:48] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 44:18] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 43:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 42:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 41:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 40:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 39:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 38:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 37:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 36:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 35:48] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 35:18] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 34:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 33:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 32:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 31:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 30:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 29:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 28:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 27:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 26:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 25:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 24:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 23:45] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 23:15] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 22:46] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 22:32] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 22:18] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 21:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 20:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 19:48] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 19:18] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 18:48] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 18:18] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 17:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 16:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 15:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 14:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 13:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 12:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 11:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 10:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 9:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 8:34] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 7:32] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 6:32] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 5:32] +// CHECK: 3:11: BinaryOperator=|| Extent=[3:11 - 4:32] +// CHECK: 3:11: ParenExpr= Extent=[3:11 - 3:35] +// CHECK: 3:12: BinaryOperator=&& Extent=[3:12 - 3:34] +// CHECK: 3:12: BinaryOperator=>= Extent=[3:12 - 3:21] +// CHECK: 3:12: UnexposedExpr=c:2:14 Extent=[3:12 - 3:13] // CHECK: 3:12: DeclRefExpr=c:2:14 Extent=[3:12 - 3:13] // CHECK: 3:17: UnexposedExpr= Extent=[3:17 - 3:21] // CHECK: 3:17: IntegerLiteral= Extent=[3:17 - 3:21] -// CHECK: 3:25: BinaryOperator= Extent=[3:25 - 3:34] +// CHECK: 3:25: BinaryOperator=<= Extent=[3:25 - 3:34] +// CHECK: 3:25: UnexposedExpr=c:2:14 Extent=[3:25 - 3:26] // CHECK: 3:25: DeclRefExpr=c:2:14 Extent=[3:25 - 3:26] // CHECK: 3:30: UnexposedExpr= Extent=[3:30 - 3:34] // CHECK: 3:30: IntegerLiteral= Extent=[3:30 - 3:34] -// CHECK: 4:9: BinaryOperator= Extent=[4:9 - 4:31] -// CHECK: 4:9: BinaryOperator= Extent=[4:9 - 4:18] +// CHECK: 4:8: ParenExpr= Extent=[4:8 - 4:32] +// CHECK: 4:9: BinaryOperator=&& Extent=[4:9 - 4:31] +// CHECK: 4:9: BinaryOperator=>= Extent=[4:9 - 4:18] +// CHECK: 4:9: UnexposedExpr=c:2:14 Extent=[4:9 - 4:10] // CHECK: 4:9: DeclRefExpr=c:2:14 Extent=[4:9 - 4:10] // CHECK: 4:14: UnexposedExpr= Extent=[4:14 - 4:18] // CHECK: 4:14: IntegerLiteral= Extent=[4:14 - 4:18] -// CHECK: 4:22: BinaryOperator= Extent=[4:22 - 4:31] +// CHECK: 4:22: BinaryOperator=<= Extent=[4:22 - 4:31] +// CHECK: 4:22: UnexposedExpr=c:2:14 Extent=[4:22 - 4:23] // CHECK: 4:22: DeclRefExpr=c:2:14 Extent=[4:22 - 4:23] // CHECK: 4:27: UnexposedExpr= Extent=[4:27 - 4:31] // CHECK: 4:27: IntegerLiteral= Extent=[4:27 - 4:31] // CHECK: 5:8: ParenExpr= Extent=[5:8 - 5:32] -// CHECK: 5:9: BinaryOperator= Extent=[5:9 - 5:31] -// CHECK: 5:9: BinaryOperator= Extent=[5:9 - 5:18] +// CHECK: 5:9: BinaryOperator=&& Extent=[5:9 - 5:31] +// CHECK: 5:9: BinaryOperator=>= Extent=[5:9 - 5:18] +// CHECK: 5:9: UnexposedExpr=c:2:14 Extent=[5:9 - 5:10] // CHECK: 5:9: DeclRefExpr=c:2:14 Extent=[5:9 - 5:10] // CHECK: 5:14: UnexposedExpr= Extent=[5:14 - 5:18] // CHECK: 5:14: IntegerLiteral= Extent=[5:14 - 5:18] -// CHECK: 5:22: BinaryOperator= Extent=[5:22 - 5:31] +// CHECK: 5:22: BinaryOperator=<= Extent=[5:22 - 5:31] +// CHECK: 5:22: UnexposedExpr=c:2:14 Extent=[5:22 - 5:23] // CHECK: 5:22: DeclRefExpr=c:2:14 Extent=[5:22 - 5:23] // CHECK: 5:27: UnexposedExpr= Extent=[5:27 - 5:31] // CHECK: 5:27: IntegerLiteral= Extent=[5:27 - 5:31] -// CHECK: 6:9: BinaryOperator= Extent=[6:9 - 6:31] -// CHECK: 6:9: BinaryOperator= Extent=[6:9 - 6:18] +// CHECK: 6:8: ParenExpr= Extent=[6:8 - 6:32] +// CHECK: 6:9: BinaryOperator=&& Extent=[6:9 - 6:31] +// CHECK: 6:9: BinaryOperator=>= Extent=[6:9 - 6:18] +// CHECK: 6:9: UnexposedExpr=c:2:14 Extent=[6:9 - 6:10] // CHECK: 6:9: DeclRefExpr=c:2:14 Extent=[6:9 - 6:10] // CHECK: 6:14: UnexposedExpr= Extent=[6:14 - 6:18] // CHECK: 6:14: IntegerLiteral= Extent=[6:14 - 6:18] -// CHECK: 6:22: BinaryOperator= Extent=[6:22 - 6:31] +// CHECK: 6:22: BinaryOperator=<= Extent=[6:22 - 6:31] +// CHECK: 6:22: UnexposedExpr=c:2:14 Extent=[6:22 - 6:23] // CHECK: 6:22: DeclRefExpr=c:2:14 Extent=[6:22 - 6:23] // CHECK: 6:27: UnexposedExpr= Extent=[6:27 - 6:31] // CHECK: 6:27: IntegerLiteral= Extent=[6:27 - 6:31] -// CHECK: 7:9: BinaryOperator= Extent=[7:9 - 7:31] -// CHECK: 7:9: BinaryOperator= Extent=[7:9 - 7:18] +// CHECK: 7:8: ParenExpr= Extent=[7:8 - 7:32] +// CHECK: 7:9: BinaryOperator=&& Extent=[7:9 - 7:31] +// CHECK: 7:9: BinaryOperator=>= Extent=[7:9 - 7:18] +// CHECK: 7:9: UnexposedExpr=c:2:14 Extent=[7:9 - 7:10] // CHECK: 7:9: DeclRefExpr=c:2:14 Extent=[7:9 - 7:10] // CHECK: 7:14: UnexposedExpr= Extent=[7:14 - 7:18] // CHECK: 7:14: IntegerLiteral= Extent=[7:14 - 7:18] -// CHECK: 7:22: BinaryOperator= Extent=[7:22 - 7:31] +// CHECK: 7:22: BinaryOperator=<= Extent=[7:22 - 7:31] +// CHECK: 7:22: UnexposedExpr=c:2:14 Extent=[7:22 - 7:23] // CHECK: 7:22: DeclRefExpr=c:2:14 Extent=[7:22 - 7:23] // CHECK: 7:27: UnexposedExpr= Extent=[7:27 - 7:31] // CHECK: 7:27: IntegerLiteral= Extent=[7:27 - 7:31] -// CHECK: 8:9: BinaryOperator= Extent=[8:9 - 8:33] -// CHECK: 8:9: BinaryOperator= Extent=[8:9 - 8:19] +// CHECK: 8:8: ParenExpr= Extent=[8:8 - 8:34] +// CHECK: 8:9: BinaryOperator=&& Extent=[8:9 - 8:33] +// CHECK: 8:9: BinaryOperator=>= Extent=[8:9 - 8:19] +// CHECK: 8:9: UnexposedExpr=c:2:14 Extent=[8:9 - 8:10] // CHECK: 8:9: DeclRefExpr=c:2:14 Extent=[8:9 - 8:10] // CHECK: 8:14: UnexposedExpr= Extent=[8:14 - 8:19] // CHECK: 8:14: IntegerLiteral= Extent=[8:14 - 8:19] -// CHECK: 8:23: BinaryOperator= Extent=[8:23 - 8:33] +// CHECK: 8:23: BinaryOperator=<= Extent=[8:23 - 8:33] +// CHECK: 8:23: UnexposedExpr=c:2:14 Extent=[8:23 - 8:24] // CHECK: 8:23: DeclRefExpr=c:2:14 Extent=[8:23 - 8:24] // CHECK: 8:28: UnexposedExpr= Extent=[8:28 - 8:33] // CHECK: 8:28: IntegerLiteral= Extent=[8:28 - 8:33] -// CHECK: 9:9: BinaryOperator= Extent=[9:9 - 9:33] -// CHECK: 9:9: BinaryOperator= Extent=[9:9 - 9:19] +// CHECK: 9:8: ParenExpr= Extent=[9:8 - 9:34] +// CHECK: 9:9: BinaryOperator=&& Extent=[9:9 - 9:33] +// CHECK: 9:9: BinaryOperator=>= Extent=[9:9 - 9:19] +// CHECK: 9:9: UnexposedExpr=c:2:14 Extent=[9:9 - 9:10] // CHECK: 9:9: DeclRefExpr=c:2:14 Extent=[9:9 - 9:10] // CHECK: 9:14: UnexposedExpr= Extent=[9:14 - 9:19] // CHECK: 9:14: IntegerLiteral= Extent=[9:14 - 9:19] -// CHECK: 9:23: BinaryOperator= Extent=[9:23 - 9:33] +// CHECK: 9:23: BinaryOperator=<= Extent=[9:23 - 9:33] +// CHECK: 9:23: UnexposedExpr=c:2:14 Extent=[9:23 - 9:24] // CHECK: 9:23: DeclRefExpr=c:2:14 Extent=[9:23 - 9:24] // CHECK: 9:28: UnexposedExpr= Extent=[9:28 - 9:33] // CHECK: 9:28: IntegerLiteral= Extent=[9:28 - 9:33] -// CHECK: 10:9: BinaryOperator= Extent=[10:9 - 10:33] -// CHECK: 10:9: BinaryOperator= Extent=[10:9 - 10:19] +// CHECK: 10:8: ParenExpr= Extent=[10:8 - 10:34] +// CHECK: 10:9: BinaryOperator=&& Extent=[10:9 - 10:33] +// CHECK: 10:9: BinaryOperator=>= Extent=[10:9 - 10:19] +// CHECK: 10:9: UnexposedExpr=c:2:14 Extent=[10:9 - 10:10] // CHECK: 10:9: DeclRefExpr=c:2:14 Extent=[10:9 - 10:10] // CHECK: 10:14: UnexposedExpr= Extent=[10:14 - 10:19] // CHECK: 10:14: IntegerLiteral= Extent=[10:14 - 10:19] -// CHECK: 10:23: BinaryOperator= Extent=[10:23 - 10:33] +// CHECK: 10:23: BinaryOperator=<= Extent=[10:23 - 10:33] +// CHECK: 10:23: UnexposedExpr=c:2:14 Extent=[10:23 - 10:24] // CHECK: 10:23: DeclRefExpr=c:2:14 Extent=[10:23 - 10:24] // CHECK: 10:28: UnexposedExpr= Extent=[10:28 - 10:33] // CHECK: 10:28: IntegerLiteral= Extent=[10:28 - 10:33] -// CHECK: 11:9: BinaryOperator= Extent=[11:9 - 11:33] -// CHECK: 11:9: BinaryOperator= Extent=[11:9 - 11:19] +// CHECK: 11:8: ParenExpr= Extent=[11:8 - 11:34] +// CHECK: 11:9: BinaryOperator=&& Extent=[11:9 - 11:33] +// CHECK: 11:9: BinaryOperator=>= Extent=[11:9 - 11:19] +// CHECK: 11:9: UnexposedExpr=c:2:14 Extent=[11:9 - 11:10] // CHECK: 11:9: DeclRefExpr=c:2:14 Extent=[11:9 - 11:10] // CHECK: 11:14: UnexposedExpr= Extent=[11:14 - 11:19] // CHECK: 11:14: IntegerLiteral= Extent=[11:14 - 11:19] -// CHECK: 11:23: BinaryOperator= Extent=[11:23 - 11:33] +// CHECK: 11:23: BinaryOperator=<= Extent=[11:23 - 11:33] +// CHECK: 11:23: UnexposedExpr=c:2:14 Extent=[11:23 - 11:24] // CHECK: 11:23: DeclRefExpr=c:2:14 Extent=[11:23 - 11:24] // CHECK: 11:28: UnexposedExpr= Extent=[11:28 - 11:33] // CHECK: 11:28: IntegerLiteral= Extent=[11:28 - 11:33] -// CHECK: 12:9: BinaryOperator= Extent=[12:9 - 12:33] -// CHECK: 12:9: BinaryOperator= Extent=[12:9 - 12:19] +// CHECK: 12:8: ParenExpr= Extent=[12:8 - 12:34] +// CHECK: 12:9: BinaryOperator=&& Extent=[12:9 - 12:33] +// CHECK: 12:9: BinaryOperator=>= Extent=[12:9 - 12:19] +// CHECK: 12:9: UnexposedExpr=c:2:14 Extent=[12:9 - 12:10] // CHECK: 12:9: DeclRefExpr=c:2:14 Extent=[12:9 - 12:10] // CHECK: 12:14: UnexposedExpr= Extent=[12:14 - 12:19] // CHECK: 12:14: IntegerLiteral= Extent=[12:14 - 12:19] -// CHECK: 12:23: BinaryOperator= Extent=[12:23 - 12:33] +// CHECK: 12:23: BinaryOperator=<= Extent=[12:23 - 12:33] +// CHECK: 12:23: UnexposedExpr=c:2:14 Extent=[12:23 - 12:24] // CHECK: 12:23: DeclRefExpr=c:2:14 Extent=[12:23 - 12:24] // CHECK: 12:28: UnexposedExpr= Extent=[12:28 - 12:33] // CHECK: 12:28: IntegerLiteral= Extent=[12:28 - 12:33] -// CHECK: 13:9: BinaryOperator= Extent=[13:9 - 13:33] -// CHECK: 13:9: BinaryOperator= Extent=[13:9 - 13:19] +// CHECK: 13:8: ParenExpr= Extent=[13:8 - 13:34] +// CHECK: 13:9: BinaryOperator=&& Extent=[13:9 - 13:33] +// CHECK: 13:9: BinaryOperator=>= Extent=[13:9 - 13:19] +// CHECK: 13:9: UnexposedExpr=c:2:14 Extent=[13:9 - 13:10] // CHECK: 13:9: DeclRefExpr=c:2:14 Extent=[13:9 - 13:10] // CHECK: 13:14: UnexposedExpr= Extent=[13:14 - 13:19] // CHECK: 13:14: IntegerLiteral= Extent=[13:14 - 13:19] -// CHECK: 13:23: BinaryOperator= Extent=[13:23 - 13:33] +// CHECK: 13:23: BinaryOperator=<= Extent=[13:23 - 13:33] +// CHECK: 13:23: UnexposedExpr=c:2:14 Extent=[13:23 - 13:24] // CHECK: 13:23: DeclRefExpr=c:2:14 Extent=[13:23 - 13:24] // CHECK: 13:28: UnexposedExpr= Extent=[13:28 - 13:33] // CHECK: 13:28: IntegerLiteral= Extent=[13:28 - 13:33] -// CHECK: 14:9: BinaryOperator= Extent=[14:9 - 14:33] -// CHECK: 14:9: BinaryOperator= Extent=[14:9 - 14:19] +// CHECK: 14:8: ParenExpr= Extent=[14:8 - 14:34] +// CHECK: 14:9: BinaryOperator=&& Extent=[14:9 - 14:33] +// CHECK: 14:9: BinaryOperator=>= Extent=[14:9 - 14:19] +// CHECK: 14:9: UnexposedExpr=c:2:14 Extent=[14:9 - 14:10] // CHECK: 14:9: DeclRefExpr=c:2:14 Extent=[14:9 - 14:10] // CHECK: 14:14: UnexposedExpr= Extent=[14:14 - 14:19] // CHECK: 14:14: IntegerLiteral= Extent=[14:14 - 14:19] -// CHECK: 14:23: BinaryOperator= Extent=[14:23 - 14:33] +// CHECK: 14:23: BinaryOperator=<= Extent=[14:23 - 14:33] +// CHECK: 14:23: UnexposedExpr=c:2:14 Extent=[14:23 - 14:24] // CHECK: 14:23: DeclRefExpr=c:2:14 Extent=[14:23 - 14:24] // CHECK: 14:28: UnexposedExpr= Extent=[14:28 - 14:33] // CHECK: 14:28: IntegerLiteral= Extent=[14:28 - 14:33] -// CHECK: 15:9: BinaryOperator= Extent=[15:9 - 15:33] -// CHECK: 15:9: BinaryOperator= Extent=[15:9 - 15:19] +// CHECK: 15:8: ParenExpr= Extent=[15:8 - 15:34] +// CHECK: 15:9: BinaryOperator=&& Extent=[15:9 - 15:33] +// CHECK: 15:9: BinaryOperator=>= Extent=[15:9 - 15:19] +// CHECK: 15:9: UnexposedExpr=c:2:14 Extent=[15:9 - 15:10] // CHECK: 15:9: DeclRefExpr=c:2:14 Extent=[15:9 - 15:10] // CHECK: 15:14: UnexposedExpr= Extent=[15:14 - 15:19] // CHECK: 15:14: IntegerLiteral= Extent=[15:14 - 15:19] -// CHECK: 15:23: BinaryOperator= Extent=[15:23 - 15:33] +// CHECK: 15:23: BinaryOperator=<= Extent=[15:23 - 15:33] +// CHECK: 15:23: UnexposedExpr=c:2:14 Extent=[15:23 - 15:24] // CHECK: 15:23: DeclRefExpr=c:2:14 Extent=[15:23 - 15:24] // CHECK: 15:28: UnexposedExpr= Extent=[15:28 - 15:33] // CHECK: 15:28: IntegerLiteral= Extent=[15:28 - 15:33] -// CHECK: 16:9: BinaryOperator= Extent=[16:9 - 16:33] -// CHECK: 16:9: BinaryOperator= Extent=[16:9 - 16:19] +// CHECK: 16:8: ParenExpr= Extent=[16:8 - 16:34] +// CHECK: 16:9: BinaryOperator=&& Extent=[16:9 - 16:33] +// CHECK: 16:9: BinaryOperator=>= Extent=[16:9 - 16:19] +// CHECK: 16:9: UnexposedExpr=c:2:14 Extent=[16:9 - 16:10] // CHECK: 16:9: DeclRefExpr=c:2:14 Extent=[16:9 - 16:10] // CHECK: 16:14: UnexposedExpr= Extent=[16:14 - 16:19] // CHECK: 16:14: IntegerLiteral= Extent=[16:14 - 16:19] -// CHECK: 16:23: BinaryOperator= Extent=[16:23 - 16:33] +// CHECK: 16:23: BinaryOperator=<= Extent=[16:23 - 16:33] +// CHECK: 16:23: UnexposedExpr=c:2:14 Extent=[16:23 - 16:24] // CHECK: 16:23: DeclRefExpr=c:2:14 Extent=[16:23 - 16:24] // CHECK: 16:28: UnexposedExpr= Extent=[16:28 - 16:33] // CHECK: 16:28: IntegerLiteral= Extent=[16:28 - 16:33] -// CHECK: 17:9: BinaryOperator= Extent=[17:9 - 17:33] -// CHECK: 17:9: BinaryOperator= Extent=[17:9 - 17:19] +// CHECK: 17:8: ParenExpr= Extent=[17:8 - 17:34] +// CHECK: 17:9: BinaryOperator=&& Extent=[17:9 - 17:33] +// CHECK: 17:9: BinaryOperator=>= Extent=[17:9 - 17:19] +// CHECK: 17:9: UnexposedExpr=c:2:14 Extent=[17:9 - 17:10] // CHECK: 17:9: DeclRefExpr=c:2:14 Extent=[17:9 - 17:10] // CHECK: 17:14: UnexposedExpr= Extent=[17:14 - 17:19] // CHECK: 17:14: IntegerLiteral= Extent=[17:14 - 17:19] -// CHECK: 17:23: BinaryOperator= Extent=[17:23 - 17:33] +// CHECK: 17:23: BinaryOperator=<= Extent=[17:23 - 17:33] +// CHECK: 17:23: UnexposedExpr=c:2:14 Extent=[17:23 - 17:24] // CHECK: 17:23: DeclRefExpr=c:2:14 Extent=[17:23 - 17:24] // CHECK: 17:28: UnexposedExpr= Extent=[17:28 - 17:33] // CHECK: 17:28: IntegerLiteral= Extent=[17:28 - 17:33] -// CHECK: 18:8: BinaryOperator= Extent=[18:8 - 18:18] +// CHECK: 18:8: BinaryOperator=== Extent=[18:8 - 18:18] +// CHECK: 18:8: UnexposedExpr=c:2:14 Extent=[18:8 - 18:9] // CHECK: 18:8: DeclRefExpr=c:2:14 Extent=[18:8 - 18:9] // CHECK: 18:13: UnexposedExpr= Extent=[18:13 - 18:18] // CHECK: 18:13: IntegerLiteral= Extent=[18:13 - 18:18] -// CHECK: 18:23: BinaryOperator= Extent=[18:23 - 18:47] -// CHECK: 18:23: BinaryOperator= Extent=[18:23 - 18:33] +// CHECK: 18:22: ParenExpr= Extent=[18:22 - 18:48] +// CHECK: 18:23: BinaryOperator=&& Extent=[18:23 - 18:47] +// CHECK: 18:23: BinaryOperator=>= Extent=[18:23 - 18:33] +// CHECK: 18:23: UnexposedExpr=c:2:14 Extent=[18:23 - 18:24] // CHECK: 18:23: DeclRefExpr=c:2:14 Extent=[18:23 - 18:24] // CHECK: 18:28: UnexposedExpr= Extent=[18:28 - 18:33] // CHECK: 18:28: IntegerLiteral= Extent=[18:28 - 18:33] -// CHECK: 18:37: BinaryOperator= Extent=[18:37 - 18:47] +// CHECK: 18:37: BinaryOperator=<= Extent=[18:37 - 18:47] +// CHECK: 18:37: UnexposedExpr=c:2:14 Extent=[18:37 - 18:38] // CHECK: 18:37: DeclRefExpr=c:2:14 Extent=[18:37 - 18:38] // CHECK: 18:42: UnexposedExpr= Extent=[18:42 - 18:47] // CHECK: 18:42: IntegerLiteral= Extent=[18:42 - 18:47] -// CHECK: 19:8: BinaryOperator= Extent=[19:8 - 19:18] +// CHECK: 19:8: BinaryOperator=== Extent=[19:8 - 19:18] +// CHECK: 19:8: UnexposedExpr=c:2:14 Extent=[19:8 - 19:9] // CHECK: 19:8: DeclRefExpr=c:2:14 Extent=[19:8 - 19:9] // CHECK: 19:13: UnexposedExpr= Extent=[19:13 - 19:18] // CHECK: 19:13: IntegerLiteral= Extent=[19:13 - 19:18] -// CHECK: 19:23: BinaryOperator= Extent=[19:23 - 19:47] -// CHECK: 19:23: BinaryOperator= Extent=[19:23 - 19:33] +// CHECK: 19:22: ParenExpr= Extent=[19:22 - 19:48] +// CHECK: 19:23: BinaryOperator=&& Extent=[19:23 - 19:47] +// CHECK: 19:23: BinaryOperator=>= Extent=[19:23 - 19:33] +// CHECK: 19:23: UnexposedExpr=c:2:14 Extent=[19:23 - 19:24] // CHECK: 19:23: DeclRefExpr=c:2:14 Extent=[19:23 - 19:24] // CHECK: 19:28: UnexposedExpr= Extent=[19:28 - 19:33] // CHECK: 19:28: IntegerLiteral= Extent=[19:28 - 19:33] -// CHECK: 19:37: BinaryOperator= Extent=[19:37 - 19:47] +// CHECK: 19:37: BinaryOperator=<= Extent=[19:37 - 19:47] +// CHECK: 19:37: UnexposedExpr=c:2:14 Extent=[19:37 - 19:38] // CHECK: 19:37: DeclRefExpr=c:2:14 Extent=[19:37 - 19:38] // CHECK: 19:42: UnexposedExpr= Extent=[19:42 - 19:47] // CHECK: 19:42: IntegerLiteral= Extent=[19:42 - 19:47] -// CHECK: 20:9: BinaryOperator= Extent=[20:9 - 20:33] -// CHECK: 20:9: BinaryOperator= Extent=[20:9 - 20:19] +// CHECK: 20:8: ParenExpr= Extent=[20:8 - 20:34] +// CHECK: 20:9: BinaryOperator=&& Extent=[20:9 - 20:33] +// CHECK: 20:9: BinaryOperator=>= Extent=[20:9 - 20:19] +// CHECK: 20:9: UnexposedExpr=c:2:14 Extent=[20:9 - 20:10] // CHECK: 20:9: DeclRefExpr=c:2:14 Extent=[20:9 - 20:10] // CHECK: 20:14: UnexposedExpr= Extent=[20:14 - 20:19] // CHECK: 20:14: IntegerLiteral= Extent=[20:14 - 20:19] -// CHECK: 20:23: BinaryOperator= Extent=[20:23 - 20:33] +// CHECK: 20:23: BinaryOperator=<= Extent=[20:23 - 20:33] +// CHECK: 20:23: UnexposedExpr=c:2:14 Extent=[20:23 - 20:24] // CHECK: 20:23: DeclRefExpr=c:2:14 Extent=[20:23 - 20:24] // CHECK: 20:28: UnexposedExpr= Extent=[20:28 - 20:33] // CHECK: 20:28: IntegerLiteral= Extent=[20:28 - 20:33] -// CHECK: 21:9: BinaryOperator= Extent=[21:9 - 21:33] -// CHECK: 21:9: BinaryOperator= Extent=[21:9 - 21:19] +// CHECK: 21:8: ParenExpr= Extent=[21:8 - 21:34] +// CHECK: 21:9: BinaryOperator=&& Extent=[21:9 - 21:33] +// CHECK: 21:9: BinaryOperator=>= Extent=[21:9 - 21:19] +// CHECK: 21:9: UnexposedExpr=c:2:14 Extent=[21:9 - 21:10] // CHECK: 21:9: DeclRefExpr=c:2:14 Extent=[21:9 - 21:10] // CHECK: 21:14: UnexposedExpr= Extent=[21:14 - 21:19] // CHECK: 21:14: IntegerLiteral= Extent=[21:14 - 21:19] -// CHECK: 21:23: BinaryOperator= Extent=[21:23 - 21:33] +// CHECK: 21:23: BinaryOperator=<= Extent=[21:23 - 21:33] +// CHECK: 21:23: UnexposedExpr=c:2:14 Extent=[21:23 - 21:24] // CHECK: 21:23: DeclRefExpr=c:2:14 Extent=[21:23 - 21:24] // CHECK: 21:28: UnexposedExpr= Extent=[21:28 - 21:33] // CHECK: 21:28: IntegerLiteral= Extent=[21:28 - 21:33] -// CHECK: 22:8: BinaryOperator= Extent=[22:8 - 22:18] +// CHECK: 22:8: BinaryOperator=== Extent=[22:8 - 22:18] +// CHECK: 22:8: UnexposedExpr=c:2:14 Extent=[22:8 - 22:9] // CHECK: 22:8: DeclRefExpr=c:2:14 Extent=[22:8 - 22:9] // CHECK: 22:13: UnexposedExpr= Extent=[22:13 - 22:18] // CHECK: 22:13: IntegerLiteral= Extent=[22:13 - 22:18] -// CHECK: 22:22: BinaryOperator= Extent=[22:22 - 22:32] +// CHECK: 22:22: BinaryOperator=== Extent=[22:22 - 22:32] +// CHECK: 22:22: UnexposedExpr=c:2:14 Extent=[22:22 - 22:23] // CHECK: 22:22: DeclRefExpr=c:2:14 Extent=[22:22 - 22:23] // CHECK: 22:27: UnexposedExpr= Extent=[22:27 - 22:32] // CHECK: 22:27: IntegerLiteral= Extent=[22:27 - 22:32] -// CHECK: 22:36: BinaryOperator= Extent=[22:36 - 22:46] +// CHECK: 22:36: BinaryOperator=== Extent=[22:36 - 22:46] +// CHECK: 22:36: UnexposedExpr=c:2:14 Extent=[22:36 - 22:37] // CHECK: 22:36: DeclRefExpr=c:2:14 Extent=[22:36 - 22:37] // CHECK: 22:41: UnexposedExpr= Extent=[22:41 - 22:46] // CHECK: 22:41: IntegerLiteral= Extent=[22:41 - 22:46] -// CHECK: 23:5: BinaryOperator= Extent=[23:5 - 23:15] +// CHECK: 23:5: BinaryOperator=== Extent=[23:5 - 23:15] +// CHECK: 23:5: UnexposedExpr=c:2:14 Extent=[23:5 - 23:6] // CHECK: 23:5: DeclRefExpr=c:2:14 Extent=[23:5 - 23:6] // CHECK: 23:10: UnexposedExpr= Extent=[23:10 - 23:15] // CHECK: 23:10: IntegerLiteral= Extent=[23:10 - 23:15] -// CHECK: 23:20: BinaryOperator= Extent=[23:20 - 23:44] -// CHECK: 23:20: BinaryOperator= Extent=[23:20 - 23:30] +// CHECK: 23:19: ParenExpr= Extent=[23:19 - 23:45] +// CHECK: 23:20: BinaryOperator=&& Extent=[23:20 - 23:44] +// CHECK: 23:20: BinaryOperator=>= Extent=[23:20 - 23:30] +// CHECK: 23:20: UnexposedExpr=c:2:14 Extent=[23:20 - 23:21] // CHECK: 23:20: DeclRefExpr=c:2:14 Extent=[23:20 - 23:21] // CHECK: 23:25: UnexposedExpr= Extent=[23:25 - 23:30] // CHECK: 23:25: IntegerLiteral= Extent=[23:25 - 23:30] -// CHECK: 23:34: BinaryOperator= Extent=[23:34 - 23:44] +// CHECK: 23:34: BinaryOperator=<= Extent=[23:34 - 23:44] +// CHECK: 23:34: UnexposedExpr=c:2:14 Extent=[23:34 - 23:35] // CHECK: 23:34: DeclRefExpr=c:2:14 Extent=[23:34 - 23:35] // CHECK: 23:39: UnexposedExpr= Extent=[23:39 - 23:44] // CHECK: 23:39: IntegerLiteral= Extent=[23:39 - 23:44] -// CHECK: 24:9: BinaryOperator= Extent=[24:9 - 24:33] -// CHECK: 24:9: BinaryOperator= Extent=[24:9 - 24:19] +// CHECK: 24:8: ParenExpr= Extent=[24:8 - 24:34] +// CHECK: 24:9: BinaryOperator=&& Extent=[24:9 - 24:33] +// CHECK: 24:9: BinaryOperator=>= Extent=[24:9 - 24:19] +// CHECK: 24:9: UnexposedExpr=c:2:14 Extent=[24:9 - 24:10] // CHECK: 24:9: DeclRefExpr=c:2:14 Extent=[24:9 - 24:10] // CHECK: 24:14: UnexposedExpr= Extent=[24:14 - 24:19] // CHECK: 24:14: IntegerLiteral= Extent=[24:14 - 24:19] -// CHECK: 24:23: BinaryOperator= Extent=[24:23 - 24:33] +// CHECK: 24:23: BinaryOperator=<= Extent=[24:23 - 24:33] +// CHECK: 24:23: UnexposedExpr=c:2:14 Extent=[24:23 - 24:24] // CHECK: 24:23: DeclRefExpr=c:2:14 Extent=[24:23 - 24:24] // CHECK: 24:28: UnexposedExpr= Extent=[24:28 - 24:33] // CHECK: 24:28: IntegerLiteral= Extent=[24:28 - 24:33] -// CHECK: 25:9: BinaryOperator= Extent=[25:9 - 25:33] -// CHECK: 25:9: BinaryOperator= Extent=[25:9 - 25:19] +// CHECK: 25:8: ParenExpr= Extent=[25:8 - 25:34] +// CHECK: 25:9: BinaryOperator=&& Extent=[25:9 - 25:33] +// CHECK: 25:9: BinaryOperator=>= Extent=[25:9 - 25:19] +// CHECK: 25:9: UnexposedExpr=c:2:14 Extent=[25:9 - 25:10] // CHECK: 25:9: DeclRefExpr=c:2:14 Extent=[25:9 - 25:10] // CHECK: 25:14: UnexposedExpr= Extent=[25:14 - 25:19] // CHECK: 25:14: IntegerLiteral= Extent=[25:14 - 25:19] -// CHECK: 25:23: BinaryOperator= Extent=[25:23 - 25:33] +// CHECK: 25:23: BinaryOperator=<= Extent=[25:23 - 25:33] +// CHECK: 25:23: UnexposedExpr=c:2:14 Extent=[25:23 - 25:24] // CHECK: 25:23: DeclRefExpr=c:2:14 Extent=[25:23 - 25:24] // CHECK: 25:28: UnexposedExpr= Extent=[25:28 - 25:33] // CHECK: 25:28: IntegerLiteral= Extent=[25:28 - 25:33] -// CHECK: 26:9: BinaryOperator= Extent=[26:9 - 26:33] -// CHECK: 26:9: BinaryOperator= Extent=[26:9 - 26:19] +// CHECK: 26:8: ParenExpr= Extent=[26:8 - 26:34] +// CHECK: 26:9: BinaryOperator=&& Extent=[26:9 - 26:33] +// CHECK: 26:9: BinaryOperator=>= Extent=[26:9 - 26:19] +// CHECK: 26:9: UnexposedExpr=c:2:14 Extent=[26:9 - 26:10] // CHECK: 26:9: DeclRefExpr=c:2:14 Extent=[26:9 - 26:10] // CHECK: 26:14: UnexposedExpr= Extent=[26:14 - 26:19] // CHECK: 26:14: IntegerLiteral= Extent=[26:14 - 26:19] -// CHECK: 26:23: BinaryOperator= Extent=[26:23 - 26:33] +// CHECK: 26:23: BinaryOperator=<= Extent=[26:23 - 26:33] +// CHECK: 26:23: UnexposedExpr=c:2:14 Extent=[26:23 - 26:24] // CHECK: 26:23: DeclRefExpr=c:2:14 Extent=[26:23 - 26:24] // CHECK: 26:28: UnexposedExpr= Extent=[26:28 - 26:33] // CHECK: 26:28: IntegerLiteral= Extent=[26:28 - 26:33] -// CHECK: 27:9: BinaryOperator= Extent=[27:9 - 27:33] -// CHECK: 27:9: BinaryOperator= Extent=[27:9 - 27:19] +// CHECK: 27:8: ParenExpr= Extent=[27:8 - 27:34] +// CHECK: 27:9: BinaryOperator=&& Extent=[27:9 - 27:33] +// CHECK: 27:9: BinaryOperator=>= Extent=[27:9 - 27:19] +// CHECK: 27:9: UnexposedExpr=c:2:14 Extent=[27:9 - 27:10] // CHECK: 27:9: DeclRefExpr=c:2:14 Extent=[27:9 - 27:10] // CHECK: 27:14: UnexposedExpr= Extent=[27:14 - 27:19] // CHECK: 27:14: IntegerLiteral= Extent=[27:14 - 27:19] -// CHECK: 27:23: BinaryOperator= Extent=[27:23 - 27:33] +// CHECK: 27:23: BinaryOperator=<= Extent=[27:23 - 27:33] +// CHECK: 27:23: UnexposedExpr=c:2:14 Extent=[27:23 - 27:24] // CHECK: 27:23: DeclRefExpr=c:2:14 Extent=[27:23 - 27:24] // CHECK: 27:28: UnexposedExpr= Extent=[27:28 - 27:33] // CHECK: 27:28: IntegerLiteral= Extent=[27:28 - 27:33] -// CHECK: 28:9: BinaryOperator= Extent=[28:9 - 28:33] -// CHECK: 28:9: BinaryOperator= Extent=[28:9 - 28:19] +// CHECK: 28:8: ParenExpr= Extent=[28:8 - 28:34] +// CHECK: 28:9: BinaryOperator=&& Extent=[28:9 - 28:33] +// CHECK: 28:9: BinaryOperator=>= Extent=[28:9 - 28:19] +// CHECK: 28:9: UnexposedExpr=c:2:14 Extent=[28:9 - 28:10] // CHECK: 28:9: DeclRefExpr=c:2:14 Extent=[28:9 - 28:10] // CHECK: 28:14: UnexposedExpr= Extent=[28:14 - 28:19] // CHECK: 28:14: IntegerLiteral= Extent=[28:14 - 28:19] -// CHECK: 28:23: BinaryOperator= Extent=[28:23 - 28:33] +// CHECK: 28:23: BinaryOperator=<= Extent=[28:23 - 28:33] +// CHECK: 28:23: UnexposedExpr=c:2:14 Extent=[28:23 - 28:24] // CHECK: 28:23: DeclRefExpr=c:2:14 Extent=[28:23 - 28:24] // CHECK: 28:28: UnexposedExpr= Extent=[28:28 - 28:33] // CHECK: 28:28: IntegerLiteral= Extent=[28:28 - 28:33] -// CHECK: 29:9: BinaryOperator= Extent=[29:9 - 29:33] -// CHECK: 29:9: BinaryOperator= Extent=[29:9 - 29:19] +// CHECK: 29:8: ParenExpr= Extent=[29:8 - 29:34] +// CHECK: 29:9: BinaryOperator=&& Extent=[29:9 - 29:33] +// CHECK: 29:9: BinaryOperator=>= Extent=[29:9 - 29:19] +// CHECK: 29:9: UnexposedExpr=c:2:14 Extent=[29:9 - 29:10] // CHECK: 29:9: DeclRefExpr=c:2:14 Extent=[29:9 - 29:10] // CHECK: 29:14: UnexposedExpr= Extent=[29:14 - 29:19] // CHECK: 29:14: IntegerLiteral= Extent=[29:14 - 29:19] -// CHECK: 29:23: BinaryOperator= Extent=[29:23 - 29:33] +// CHECK: 29:23: BinaryOperator=<= Extent=[29:23 - 29:33] +// CHECK: 29:23: UnexposedExpr=c:2:14 Extent=[29:23 - 29:24] // CHECK: 29:23: DeclRefExpr=c:2:14 Extent=[29:23 - 29:24] // CHECK: 29:28: UnexposedExpr= Extent=[29:28 - 29:33] // CHECK: 29:28: IntegerLiteral= Extent=[29:28 - 29:33] -// CHECK: 30:9: BinaryOperator= Extent=[30:9 - 30:33] -// CHECK: 30:9: BinaryOperator= Extent=[30:9 - 30:19] +// CHECK: 30:8: ParenExpr= Extent=[30:8 - 30:34] +// CHECK: 30:9: BinaryOperator=&& Extent=[30:9 - 30:33] +// CHECK: 30:9: BinaryOperator=>= Extent=[30:9 - 30:19] +// CHECK: 30:9: UnexposedExpr=c:2:14 Extent=[30:9 - 30:10] // CHECK: 30:9: DeclRefExpr=c:2:14 Extent=[30:9 - 30:10] // CHECK: 30:14: UnexposedExpr= Extent=[30:14 - 30:19] // CHECK: 30:14: IntegerLiteral= Extent=[30:14 - 30:19] -// CHECK: 30:23: BinaryOperator= Extent=[30:23 - 30:33] +// CHECK: 30:23: BinaryOperator=<= Extent=[30:23 - 30:33] +// CHECK: 30:23: UnexposedExpr=c:2:14 Extent=[30:23 - 30:24] // CHECK: 30:23: DeclRefExpr=c:2:14 Extent=[30:23 - 30:24] // CHECK: 30:28: UnexposedExpr= Extent=[30:28 - 30:33] // CHECK: 30:28: IntegerLiteral= Extent=[30:28 - 30:33] -// CHECK: 31:9: BinaryOperator= Extent=[31:9 - 31:33] -// CHECK: 31:9: BinaryOperator= Extent=[31:9 - 31:19] +// CHECK: 31:8: ParenExpr= Extent=[31:8 - 31:34] +// CHECK: 31:9: BinaryOperator=&& Extent=[31:9 - 31:33] +// CHECK: 31:9: BinaryOperator=>= Extent=[31:9 - 31:19] +// CHECK: 31:9: UnexposedExpr=c:2:14 Extent=[31:9 - 31:10] // CHECK: 31:9: DeclRefExpr=c:2:14 Extent=[31:9 - 31:10] // CHECK: 31:14: UnexposedExpr= Extent=[31:14 - 31:19] // CHECK: 31:14: IntegerLiteral= Extent=[31:14 - 31:19] -// CHECK: 31:23: BinaryOperator= Extent=[31:23 - 31:33] +// CHECK: 31:23: BinaryOperator=<= Extent=[31:23 - 31:33] +// CHECK: 31:23: UnexposedExpr=c:2:14 Extent=[31:23 - 31:24] // CHECK: 31:23: DeclRefExpr=c:2:14 Extent=[31:23 - 31:24] // CHECK: 31:28: UnexposedExpr= Extent=[31:28 - 31:33] // CHECK: 31:28: IntegerLiteral= Extent=[31:28 - 31:33] -// CHECK: 32:9: BinaryOperator= Extent=[32:9 - 32:33] -// CHECK: 32:9: BinaryOperator= Extent=[32:9 - 32:19] +// CHECK: 32:8: ParenExpr= Extent=[32:8 - 32:34] +// CHECK: 32:9: BinaryOperator=&& Extent=[32:9 - 32:33] +// CHECK: 32:9: BinaryOperator=>= Extent=[32:9 - 32:19] +// CHECK: 32:9: UnexposedExpr=c:2:14 Extent=[32:9 - 32:10] // CHECK: 32:9: DeclRefExpr=c:2:14 Extent=[32:9 - 32:10] // CHECK: 32:14: UnexposedExpr= Extent=[32:14 - 32:19] // CHECK: 32:14: IntegerLiteral= Extent=[32:14 - 32:19] -// CHECK: 32:23: BinaryOperator= Extent=[32:23 - 32:33] +// CHECK: 32:23: BinaryOperator=<= Extent=[32:23 - 32:33] +// CHECK: 32:23: UnexposedExpr=c:2:14 Extent=[32:23 - 32:24] // CHECK: 32:23: DeclRefExpr=c:2:14 Extent=[32:23 - 32:24] // CHECK: 32:28: UnexposedExpr= Extent=[32:28 - 32:33] // CHECK: 32:28: IntegerLiteral= Extent=[32:28 - 32:33] -// CHECK: 33:9: BinaryOperator= Extent=[33:9 - 33:33] -// CHECK: 33:9: BinaryOperator= Extent=[33:9 - 33:19] +// CHECK: 33:8: ParenExpr= Extent=[33:8 - 33:34] +// CHECK: 33:9: BinaryOperator=&& Extent=[33:9 - 33:33] +// CHECK: 33:9: BinaryOperator=>= Extent=[33:9 - 33:19] +// CHECK: 33:9: UnexposedExpr=c:2:14 Extent=[33:9 - 33:10] // CHECK: 33:9: DeclRefExpr=c:2:14 Extent=[33:9 - 33:10] // CHECK: 33:14: UnexposedExpr= Extent=[33:14 - 33:19] // CHECK: 33:14: IntegerLiteral= Extent=[33:14 - 33:19] -// CHECK: 33:23: BinaryOperator= Extent=[33:23 - 33:33] +// CHECK: 33:23: BinaryOperator=<= Extent=[33:23 - 33:33] +// CHECK: 33:23: UnexposedExpr=c:2:14 Extent=[33:23 - 33:24] // CHECK: 33:23: DeclRefExpr=c:2:14 Extent=[33:23 - 33:24] // CHECK: 33:28: UnexposedExpr= Extent=[33:28 - 33:33] // CHECK: 33:28: IntegerLiteral= Extent=[33:28 - 33:33] -// CHECK: 34:9: BinaryOperator= Extent=[34:9 - 34:33] -// CHECK: 34:9: BinaryOperator= Extent=[34:9 - 34:19] +// CHECK: 34:8: ParenExpr= Extent=[34:8 - 34:34] +// CHECK: 34:9: BinaryOperator=&& Extent=[34:9 - 34:33] +// CHECK: 34:9: BinaryOperator=>= Extent=[34:9 - 34:19] +// CHECK: 34:9: UnexposedExpr=c:2:14 Extent=[34:9 - 34:10] // CHECK: 34:9: DeclRefExpr=c:2:14 Extent=[34:9 - 34:10] // CHECK: 34:14: UnexposedExpr= Extent=[34:14 - 34:19] // CHECK: 34:14: IntegerLiteral= Extent=[34:14 - 34:19] -// CHECK: 34:23: BinaryOperator= Extent=[34:23 - 34:33] +// CHECK: 34:23: BinaryOperator=<= Extent=[34:23 - 34:33] +// CHECK: 34:23: UnexposedExpr=c:2:14 Extent=[34:23 - 34:24] // CHECK: 34:23: DeclRefExpr=c:2:14 Extent=[34:23 - 34:24] // CHECK: 34:28: UnexposedExpr= Extent=[34:28 - 34:33] // CHECK: 34:28: IntegerLiteral= Extent=[34:28 - 34:33] -// CHECK: 35:8: BinaryOperator= Extent=[35:8 - 35:18] +// CHECK: 35:8: BinaryOperator=== Extent=[35:8 - 35:18] +// CHECK: 35:8: UnexposedExpr=c:2:14 Extent=[35:8 - 35:9] // CHECK: 35:8: DeclRefExpr=c:2:14 Extent=[35:8 - 35:9] // CHECK: 35:13: UnexposedExpr= Extent=[35:13 - 35:18] // CHECK: 35:13: IntegerLiteral= Extent=[35:13 - 35:18] -// CHECK: 35:23: BinaryOperator= Extent=[35:23 - 35:47] -// CHECK: 35:23: BinaryOperator= Extent=[35:23 - 35:33] +// CHECK: 35:22: ParenExpr= Extent=[35:22 - 35:48] +// CHECK: 35:23: BinaryOperator=&& Extent=[35:23 - 35:47] +// CHECK: 35:23: BinaryOperator=>= Extent=[35:23 - 35:33] +// CHECK: 35:23: UnexposedExpr=c:2:14 Extent=[35:23 - 35:24] // CHECK: 35:23: DeclRefExpr=c:2:14 Extent=[35:23 - 35:24] // CHECK: 35:28: UnexposedExpr= Extent=[35:28 - 35:33] // CHECK: 35:28: IntegerLiteral= Extent=[35:28 - 35:33] -// CHECK: 35:37: BinaryOperator= Extent=[35:37 - 35:47] +// CHECK: 35:37: BinaryOperator=<= Extent=[35:37 - 35:47] +// CHECK: 35:37: UnexposedExpr=c:2:14 Extent=[35:37 - 35:38] // CHECK: 35:37: DeclRefExpr=c:2:14 Extent=[35:37 - 35:38] // CHECK: 35:42: UnexposedExpr= Extent=[35:42 - 35:47] // CHECK: 35:42: IntegerLiteral= Extent=[35:42 - 35:47] -// CHECK: 36:9: BinaryOperator= Extent=[36:9 - 36:33] -// CHECK: 36:9: BinaryOperator= Extent=[36:9 - 36:19] +// CHECK: 36:8: ParenExpr= Extent=[36:8 - 36:34] +// CHECK: 36:9: BinaryOperator=&& Extent=[36:9 - 36:33] +// CHECK: 36:9: BinaryOperator=>= Extent=[36:9 - 36:19] +// CHECK: 36:9: UnexposedExpr=c:2:14 Extent=[36:9 - 36:10] // CHECK: 36:9: DeclRefExpr=c:2:14 Extent=[36:9 - 36:10] // CHECK: 36:14: UnexposedExpr= Extent=[36:14 - 36:19] // CHECK: 36:14: IntegerLiteral= Extent=[36:14 - 36:19] -// CHECK: 36:23: BinaryOperator= Extent=[36:23 - 36:33] +// CHECK: 36:23: BinaryOperator=<= Extent=[36:23 - 36:33] +// CHECK: 36:23: UnexposedExpr=c:2:14 Extent=[36:23 - 36:24] // CHECK: 36:23: DeclRefExpr=c:2:14 Extent=[36:23 - 36:24] // CHECK: 36:28: UnexposedExpr= Extent=[36:28 - 36:33] // CHECK: 36:28: IntegerLiteral= Extent=[36:28 - 36:33] -// CHECK: 37:9: BinaryOperator= Extent=[37:9 - 37:33] -// CHECK: 37:9: BinaryOperator= Extent=[37:9 - 37:19] +// CHECK: 37:8: ParenExpr= Extent=[37:8 - 37:34] +// CHECK: 37:9: BinaryOperator=&& Extent=[37:9 - 37:33] +// CHECK: 37:9: BinaryOperator=>= Extent=[37:9 - 37:19] +// CHECK: 37:9: UnexposedExpr=c:2:14 Extent=[37:9 - 37:10] // CHECK: 37:9: DeclRefExpr=c:2:14 Extent=[37:9 - 37:10] // CHECK: 37:14: UnexposedExpr= Extent=[37:14 - 37:19] // CHECK: 37:14: IntegerLiteral= Extent=[37:14 - 37:19] -// CHECK: 37:23: BinaryOperator= Extent=[37:23 - 37:33] +// CHECK: 37:23: BinaryOperator=<= Extent=[37:23 - 37:33] +// CHECK: 37:23: UnexposedExpr=c:2:14 Extent=[37:23 - 37:24] // CHECK: 37:23: DeclRefExpr=c:2:14 Extent=[37:23 - 37:24] // CHECK: 37:28: UnexposedExpr= Extent=[37:28 - 37:33] // CHECK: 37:28: IntegerLiteral= Extent=[37:28 - 37:33] -// CHECK: 38:9: BinaryOperator= Extent=[38:9 - 38:33] -// CHECK: 38:9: BinaryOperator= Extent=[38:9 - 38:19] +// CHECK: 38:8: ParenExpr= Extent=[38:8 - 38:34] +// CHECK: 38:9: BinaryOperator=&& Extent=[38:9 - 38:33] +// CHECK: 38:9: BinaryOperator=>= Extent=[38:9 - 38:19] +// CHECK: 38:9: UnexposedExpr=c:2:14 Extent=[38:9 - 38:10] // CHECK: 38:9: DeclRefExpr=c:2:14 Extent=[38:9 - 38:10] // CHECK: 38:14: UnexposedExpr= Extent=[38:14 - 38:19] // CHECK: 38:14: IntegerLiteral= Extent=[38:14 - 38:19] -// CHECK: 38:23: BinaryOperator= Extent=[38:23 - 38:33] +// CHECK: 38:23: BinaryOperator=<= Extent=[38:23 - 38:33] +// CHECK: 38:23: UnexposedExpr=c:2:14 Extent=[38:23 - 38:24] // CHECK: 38:23: DeclRefExpr=c:2:14 Extent=[38:23 - 38:24] // CHECK: 38:28: UnexposedExpr= Extent=[38:28 - 38:33] // CHECK: 38:28: IntegerLiteral= Extent=[38:28 - 38:33] -// CHECK: 39:9: BinaryOperator= Extent=[39:9 - 39:33] -// CHECK: 39:9: BinaryOperator= Extent=[39:9 - 39:19] +// CHECK: 39:8: ParenExpr= Extent=[39:8 - 39:34] +// CHECK: 39:9: BinaryOperator=&& Extent=[39:9 - 39:33] +// CHECK: 39:9: BinaryOperator=>= Extent=[39:9 - 39:19] +// CHECK: 39:9: UnexposedExpr=c:2:14 Extent=[39:9 - 39:10] // CHECK: 39:9: DeclRefExpr=c:2:14 Extent=[39:9 - 39:10] // CHECK: 39:14: UnexposedExpr= Extent=[39:14 - 39:19] // CHECK: 39:14: IntegerLiteral= Extent=[39:14 - 39:19] -// CHECK: 39:23: BinaryOperator= Extent=[39:23 - 39:33] +// CHECK: 39:23: BinaryOperator=<= Extent=[39:23 - 39:33] +// CHECK: 39:23: UnexposedExpr=c:2:14 Extent=[39:23 - 39:24] // CHECK: 39:23: DeclRefExpr=c:2:14 Extent=[39:23 - 39:24] // CHECK: 39:28: UnexposedExpr= Extent=[39:28 - 39:33] // CHECK: 39:28: IntegerLiteral= Extent=[39:28 - 39:33] -// CHECK: 40:9: BinaryOperator= Extent=[40:9 - 40:33] -// CHECK: 40:9: BinaryOperator= Extent=[40:9 - 40:19] +// CHECK: 40:8: ParenExpr= Extent=[40:8 - 40:34] +// CHECK: 40:9: BinaryOperator=&& Extent=[40:9 - 40:33] +// CHECK: 40:9: BinaryOperator=>= Extent=[40:9 - 40:19] +// CHECK: 40:9: UnexposedExpr=c:2:14 Extent=[40:9 - 40:10] // CHECK: 40:9: DeclRefExpr=c:2:14 Extent=[40:9 - 40:10] // CHECK: 40:14: UnexposedExpr= Extent=[40:14 - 40:19] // CHECK: 40:14: IntegerLiteral= Extent=[40:14 - 40:19] -// CHECK: 40:23: BinaryOperator= Extent=[40:23 - 40:33] +// CHECK: 40:23: BinaryOperator=<= Extent=[40:23 - 40:33] +// CHECK: 40:23: UnexposedExpr=c:2:14 Extent=[40:23 - 40:24] // CHECK: 40:23: DeclRefExpr=c:2:14 Extent=[40:23 - 40:24] // CHECK: 40:28: UnexposedExpr= Extent=[40:28 - 40:33] // CHECK: 40:28: IntegerLiteral= Extent=[40:28 - 40:33] -// CHECK: 41:9: BinaryOperator= Extent=[41:9 - 41:33] -// CHECK: 41:9: BinaryOperator= Extent=[41:9 - 41:19] +// CHECK: 41:8: ParenExpr= Extent=[41:8 - 41:34] +// CHECK: 41:9: BinaryOperator=&& Extent=[41:9 - 41:33] +// CHECK: 41:9: BinaryOperator=>= Extent=[41:9 - 41:19] +// CHECK: 41:9: UnexposedExpr=c:2:14 Extent=[41:9 - 41:10] // CHECK: 41:9: DeclRefExpr=c:2:14 Extent=[41:9 - 41:10] // CHECK: 41:14: UnexposedExpr= Extent=[41:14 - 41:19] // CHECK: 41:14: IntegerLiteral= Extent=[41:14 - 41:19] -// CHECK: 41:23: BinaryOperator= Extent=[41:23 - 41:33] +// CHECK: 41:23: BinaryOperator=<= Extent=[41:23 - 41:33] +// CHECK: 41:23: UnexposedExpr=c:2:14 Extent=[41:23 - 41:24] // CHECK: 41:23: DeclRefExpr=c:2:14 Extent=[41:23 - 41:24] // CHECK: 41:28: UnexposedExpr= Extent=[41:28 - 41:33] // CHECK: 41:28: IntegerLiteral= Extent=[41:28 - 41:33] -// CHECK: 42:9: BinaryOperator= Extent=[42:9 - 42:33] -// CHECK: 42:9: BinaryOperator= Extent=[42:9 - 42:19] +// CHECK: 42:8: ParenExpr= Extent=[42:8 - 42:34] +// CHECK: 42:9: BinaryOperator=&& Extent=[42:9 - 42:33] +// CHECK: 42:9: BinaryOperator=>= Extent=[42:9 - 42:19] +// CHECK: 42:9: UnexposedExpr=c:2:14 Extent=[42:9 - 42:10] // CHECK: 42:9: DeclRefExpr=c:2:14 Extent=[42:9 - 42:10] // CHECK: 42:14: UnexposedExpr= Extent=[42:14 - 42:19] // CHECK: 42:14: IntegerLiteral= Extent=[42:14 - 42:19] -// CHECK: 42:23: BinaryOperator= Extent=[42:23 - 42:33] +// CHECK: 42:23: BinaryOperator=<= Extent=[42:23 - 42:33] +// CHECK: 42:23: UnexposedExpr=c:2:14 Extent=[42:23 - 42:24] // CHECK: 42:23: DeclRefExpr=c:2:14 Extent=[42:23 - 42:24] // CHECK: 42:28: UnexposedExpr= Extent=[42:28 - 42:33] // CHECK: 42:28: IntegerLiteral= Extent=[42:28 - 42:33] -// CHECK: 43:9: BinaryOperator= Extent=[43:9 - 43:33] -// CHECK: 43:9: BinaryOperator= Extent=[43:9 - 43:19] +// CHECK: 43:8: ParenExpr= Extent=[43:8 - 43:34] +// CHECK: 43:9: BinaryOperator=&& Extent=[43:9 - 43:33] +// CHECK: 43:9: BinaryOperator=>= Extent=[43:9 - 43:19] +// CHECK: 43:9: UnexposedExpr=c:2:14 Extent=[43:9 - 43:10] // CHECK: 43:9: DeclRefExpr=c:2:14 Extent=[43:9 - 43:10] // CHECK: 43:14: UnexposedExpr= Extent=[43:14 - 43:19] // CHECK: 43:14: IntegerLiteral= Extent=[43:14 - 43:19] -// CHECK: 43:23: BinaryOperator= Extent=[43:23 - 43:33] +// CHECK: 43:23: BinaryOperator=<= Extent=[43:23 - 43:33] +// CHECK: 43:23: UnexposedExpr=c:2:14 Extent=[43:23 - 43:24] // CHECK: 43:23: DeclRefExpr=c:2:14 Extent=[43:23 - 43:24] // CHECK: 43:28: UnexposedExpr= Extent=[43:28 - 43:33] // CHECK: 43:28: IntegerLiteral= Extent=[43:28 - 43:33] -// CHECK: 44:8: BinaryOperator= Extent=[44:8 - 44:18] +// CHECK: 44:8: BinaryOperator=== Extent=[44:8 - 44:18] +// CHECK: 44:8: UnexposedExpr=c:2:14 Extent=[44:8 - 44:9] // CHECK: 44:8: DeclRefExpr=c:2:14 Extent=[44:8 - 44:9] // CHECK: 44:13: UnexposedExpr= Extent=[44:13 - 44:18] // CHECK: 44:13: IntegerLiteral= Extent=[44:13 - 44:18] -// CHECK: 44:23: BinaryOperator= Extent=[44:23 - 44:47] -// CHECK: 44:23: BinaryOperator= Extent=[44:23 - 44:33] +// CHECK: 44:22: ParenExpr= Extent=[44:22 - 44:48] +// CHECK: 44:23: BinaryOperator=&& Extent=[44:23 - 44:47] +// CHECK: 44:23: BinaryOperator=>= Extent=[44:23 - 44:33] +// CHECK: 44:23: UnexposedExpr=c:2:14 Extent=[44:23 - 44:24] // CHECK: 44:23: DeclRefExpr=c:2:14 Extent=[44:23 - 44:24] // CHECK: 44:28: UnexposedExpr= Extent=[44:28 - 44:33] // CHECK: 44:28: IntegerLiteral= Extent=[44:28 - 44:33] -// CHECK: 44:37: BinaryOperator= Extent=[44:37 - 44:47] +// CHECK: 44:37: BinaryOperator=<= Extent=[44:37 - 44:47] +// CHECK: 44:37: UnexposedExpr=c:2:14 Extent=[44:37 - 44:38] // CHECK: 44:37: DeclRefExpr=c:2:14 Extent=[44:37 - 44:38] // CHECK: 44:42: UnexposedExpr= Extent=[44:42 - 44:47] // CHECK: 44:42: IntegerLiteral= Extent=[44:42 - 44:47] -// CHECK: 45:9: BinaryOperator= Extent=[45:9 - 45:33] -// CHECK: 45:9: BinaryOperator= Extent=[45:9 - 45:19] +// CHECK: 45:8: ParenExpr= Extent=[45:8 - 45:34] +// CHECK: 45:9: BinaryOperator=&& Extent=[45:9 - 45:33] +// CHECK: 45:9: BinaryOperator=>= Extent=[45:9 - 45:19] +// CHECK: 45:9: UnexposedExpr=c:2:14 Extent=[45:9 - 45:10] // CHECK: 45:9: DeclRefExpr=c:2:14 Extent=[45:9 - 45:10] // CHECK: 45:14: UnexposedExpr= Extent=[45:14 - 45:19] // CHECK: 45:14: IntegerLiteral= Extent=[45:14 - 45:19] -// CHECK: 45:23: BinaryOperator= Extent=[45:23 - 45:33] +// CHECK: 45:23: BinaryOperator=<= Extent=[45:23 - 45:33] +// CHECK: 45:23: UnexposedExpr=c:2:14 Extent=[45:23 - 45:24] // CHECK: 45:23: DeclRefExpr=c:2:14 Extent=[45:23 - 45:24] // CHECK: 45:28: UnexposedExpr= Extent=[45:28 - 45:33] // CHECK: 45:28: IntegerLiteral= Extent=[45:28 - 45:33] -// CHECK: 46:8: BinaryOperator= Extent=[46:8 - 46:18] +// CHECK: 46:8: BinaryOperator=== Extent=[46:8 - 46:18] +// CHECK: 46:8: UnexposedExpr=c:2:14 Extent=[46:8 - 46:9] // CHECK: 46:8: DeclRefExpr=c:2:14 Extent=[46:8 - 46:9] // CHECK: 46:13: UnexposedExpr= Extent=[46:13 - 46:18] // CHECK: 46:13: IntegerLiteral= Extent=[46:13 - 46:18] -// CHECK: 46:23: BinaryOperator= Extent=[46:23 - 46:47] -// CHECK: 46:23: BinaryOperator= Extent=[46:23 - 46:33] +// CHECK: 46:22: ParenExpr= Extent=[46:22 - 46:48] +// CHECK: 46:23: BinaryOperator=&& Extent=[46:23 - 46:47] +// CHECK: 46:23: BinaryOperator=>= Extent=[46:23 - 46:33] +// CHECK: 46:23: UnexposedExpr=c:2:14 Extent=[46:23 - 46:24] // CHECK: 46:23: DeclRefExpr=c:2:14 Extent=[46:23 - 46:24] // CHECK: 46:28: UnexposedExpr= Extent=[46:28 - 46:33] // CHECK: 46:28: IntegerLiteral= Extent=[46:28 - 46:33] -// CHECK: 46:37: BinaryOperator= Extent=[46:37 - 46:47] +// CHECK: 46:37: BinaryOperator=<= Extent=[46:37 - 46:47] +// CHECK: 46:37: UnexposedExpr=c:2:14 Extent=[46:37 - 46:38] // CHECK: 46:37: DeclRefExpr=c:2:14 Extent=[46:37 - 46:38] // CHECK: 46:42: UnexposedExpr= Extent=[46:42 - 46:47] // CHECK: 46:42: IntegerLiteral= Extent=[46:42 - 46:47] -// CHECK: 47:9: BinaryOperator= Extent=[47:9 - 47:33] -// CHECK: 47:9: BinaryOperator= Extent=[47:9 - 47:19] +// CHECK: 47:8: ParenExpr= Extent=[47:8 - 47:34] +// CHECK: 47:9: BinaryOperator=&& Extent=[47:9 - 47:33] +// CHECK: 47:9: BinaryOperator=>= Extent=[47:9 - 47:19] +// CHECK: 47:9: UnexposedExpr=c:2:14 Extent=[47:9 - 47:10] // CHECK: 47:9: DeclRefExpr=c:2:14 Extent=[47:9 - 47:10] // CHECK: 47:14: UnexposedExpr= Extent=[47:14 - 47:19] // CHECK: 47:14: IntegerLiteral= Extent=[47:14 - 47:19] -// CHECK: 47:23: BinaryOperator= Extent=[47:23 - 47:33] +// CHECK: 47:23: BinaryOperator=<= Extent=[47:23 - 47:33] +// CHECK: 47:23: UnexposedExpr=c:2:14 Extent=[47:23 - 47:24] // CHECK: 47:23: DeclRefExpr=c:2:14 Extent=[47:23 - 47:24] // CHECK: 47:28: UnexposedExpr= Extent=[47:28 - 47:33] // CHECK: 47:28: IntegerLiteral= Extent=[47:28 - 47:33] -// CHECK: 48:9: BinaryOperator= Extent=[48:9 - 48:33] -// CHECK: 48:9: BinaryOperator= Extent=[48:9 - 48:19] +// CHECK: 48:8: ParenExpr= Extent=[48:8 - 48:34] +// CHECK: 48:9: BinaryOperator=&& Extent=[48:9 - 48:33] +// CHECK: 48:9: BinaryOperator=>= Extent=[48:9 - 48:19] +// CHECK: 48:9: UnexposedExpr=c:2:14 Extent=[48:9 - 48:10] // CHECK: 48:9: DeclRefExpr=c:2:14 Extent=[48:9 - 48:10] // CHECK: 48:14: UnexposedExpr= Extent=[48:14 - 48:19] // CHECK: 48:14: IntegerLiteral= Extent=[48:14 - 48:19] -// CHECK: 48:23: BinaryOperator= Extent=[48:23 - 48:33] +// CHECK: 48:23: BinaryOperator=<= Extent=[48:23 - 48:33] +// CHECK: 48:23: UnexposedExpr=c:2:14 Extent=[48:23 - 48:24] // CHECK: 48:23: DeclRefExpr=c:2:14 Extent=[48:23 - 48:24] // CHECK: 48:28: UnexposedExpr= Extent=[48:28 - 48:33] // CHECK: 48:28: IntegerLiteral= Extent=[48:28 - 48:33] -// CHECK: 49:9: BinaryOperator= Extent=[49:9 - 49:33] -// CHECK: 49:9: BinaryOperator= Extent=[49:9 - 49:19] +// CHECK: 49:8: ParenExpr= Extent=[49:8 - 49:34] +// CHECK: 49:9: BinaryOperator=&& Extent=[49:9 - 49:33] +// CHECK: 49:9: BinaryOperator=>= Extent=[49:9 - 49:19] +// CHECK: 49:9: UnexposedExpr=c:2:14 Extent=[49:9 - 49:10] // CHECK: 49:9: DeclRefExpr=c:2:14 Extent=[49:9 - 49:10] // CHECK: 49:14: UnexposedExpr= Extent=[49:14 - 49:19] // CHECK: 49:14: IntegerLiteral= Extent=[49:14 - 49:19] -// CHECK: 49:23: BinaryOperator= Extent=[49:23 - 49:33] +// CHECK: 49:23: BinaryOperator=<= Extent=[49:23 - 49:33] +// CHECK: 49:23: UnexposedExpr=c:2:14 Extent=[49:23 - 49:24] // CHECK: 49:23: DeclRefExpr=c:2:14 Extent=[49:23 - 49:24] // CHECK: 49:28: UnexposedExpr= Extent=[49:28 - 49:33] // CHECK: 49:28: IntegerLiteral= Extent=[49:28 - 49:33] -// CHECK: 50:9: BinaryOperator= Extent=[50:9 - 50:33] -// CHECK: 50:9: BinaryOperator= Extent=[50:9 - 50:19] +// CHECK: 50:8: ParenExpr= Extent=[50:8 - 50:34] +// CHECK: 50:9: BinaryOperator=&& Extent=[50:9 - 50:33] +// CHECK: 50:9: BinaryOperator=>= Extent=[50:9 - 50:19] +// CHECK: 50:9: UnexposedExpr=c:2:14 Extent=[50:9 - 50:10] // CHECK: 50:9: DeclRefExpr=c:2:14 Extent=[50:9 - 50:10] // CHECK: 50:14: UnexposedExpr= Extent=[50:14 - 50:19] // CHECK: 50:14: IntegerLiteral= Extent=[50:14 - 50:19] -// CHECK: 50:23: BinaryOperator= Extent=[50:23 - 50:33] +// CHECK: 50:23: BinaryOperator=<= Extent=[50:23 - 50:33] +// CHECK: 50:23: UnexposedExpr=c:2:14 Extent=[50:23 - 50:24] // CHECK: 50:23: DeclRefExpr=c:2:14 Extent=[50:23 - 50:24] // CHECK: 50:28: UnexposedExpr= Extent=[50:28 - 50:33] // CHECK: 50:28: IntegerLiteral= Extent=[50:28 - 50:33] -// CHECK: 51:8: BinaryOperator= Extent=[51:8 - 51:18] +// CHECK: 51:8: BinaryOperator=== Extent=[51:8 - 51:18] +// CHECK: 51:8: UnexposedExpr=c:2:14 Extent=[51:8 - 51:9] // CHECK: 51:8: DeclRefExpr=c:2:14 Extent=[51:8 - 51:9] // CHECK: 51:13: UnexposedExpr= Extent=[51:13 - 51:18] // CHECK: 51:13: IntegerLiteral= Extent=[51:13 - 51:18] -// CHECK: 51:23: BinaryOperator= Extent=[51:23 - 51:47] -// CHECK: 51:23: BinaryOperator= Extent=[51:23 - 51:33] +// CHECK: 51:22: ParenExpr= Extent=[51:22 - 51:48] +// CHECK: 51:23: BinaryOperator=&& Extent=[51:23 - 51:47] +// CHECK: 51:23: BinaryOperator=>= Extent=[51:23 - 51:33] +// CHECK: 51:23: UnexposedExpr=c:2:14 Extent=[51:23 - 51:24] // CHECK: 51:23: DeclRefExpr=c:2:14 Extent=[51:23 - 51:24] // CHECK: 51:28: UnexposedExpr= Extent=[51:28 - 51:33] // CHECK: 51:28: IntegerLiteral= Extent=[51:28 - 51:33] -// CHECK: 51:37: BinaryOperator= Extent=[51:37 - 51:47] +// CHECK: 51:37: BinaryOperator=<= Extent=[51:37 - 51:47] +// CHECK: 51:37: UnexposedExpr=c:2:14 Extent=[51:37 - 51:38] // CHECK: 51:37: DeclRefExpr=c:2:14 Extent=[51:37 - 51:38] // CHECK: 51:42: UnexposedExpr= Extent=[51:42 - 51:47] // CHECK: 51:42: IntegerLiteral= Extent=[51:42 - 51:47] -// CHECK: 52:9: BinaryOperator= Extent=[52:9 - 52:33] -// CHECK: 52:9: BinaryOperator= Extent=[52:9 - 52:19] +// CHECK: 52:8: ParenExpr= Extent=[52:8 - 52:34] +// CHECK: 52:9: BinaryOperator=&& Extent=[52:9 - 52:33] +// CHECK: 52:9: BinaryOperator=>= Extent=[52:9 - 52:19] +// CHECK: 52:9: UnexposedExpr=c:2:14 Extent=[52:9 - 52:10] // CHECK: 52:9: DeclRefExpr=c:2:14 Extent=[52:9 - 52:10] // CHECK: 52:14: UnexposedExpr= Extent=[52:14 - 52:19] // CHECK: 52:14: IntegerLiteral= Extent=[52:14 - 52:19] -// CHECK: 52:23: BinaryOperator= Extent=[52:23 - 52:33] +// CHECK: 52:23: BinaryOperator=<= Extent=[52:23 - 52:33] +// CHECK: 52:23: UnexposedExpr=c:2:14 Extent=[52:23 - 52:24] // CHECK: 52:23: DeclRefExpr=c:2:14 Extent=[52:23 - 52:24] // CHECK: 52:28: UnexposedExpr= Extent=[52:28 - 52:33] // CHECK: 52:28: IntegerLiteral= Extent=[52:28 - 52:33] -// CHECK: 53:9: BinaryOperator= Extent=[53:9 - 53:33] -// CHECK: 53:9: BinaryOperator= Extent=[53:9 - 53:19] +// CHECK: 53:8: ParenExpr= Extent=[53:8 - 53:34] +// CHECK: 53:9: BinaryOperator=&& Extent=[53:9 - 53:33] +// CHECK: 53:9: BinaryOperator=>= Extent=[53:9 - 53:19] +// CHECK: 53:9: UnexposedExpr=c:2:14 Extent=[53:9 - 53:10] // CHECK: 53:9: DeclRefExpr=c:2:14 Extent=[53:9 - 53:10] // CHECK: 53:14: UnexposedExpr= Extent=[53:14 - 53:19] // CHECK: 53:14: IntegerLiteral= Extent=[53:14 - 53:19] -// CHECK: 53:23: BinaryOperator= Extent=[53:23 - 53:33] +// CHECK: 53:23: BinaryOperator=<= Extent=[53:23 - 53:33] +// CHECK: 53:23: UnexposedExpr=c:2:14 Extent=[53:23 - 53:24] // CHECK: 53:23: DeclRefExpr=c:2:14 Extent=[53:23 - 53:24] // CHECK: 53:28: UnexposedExpr= Extent=[53:28 - 53:33] // CHECK: 53:28: IntegerLiteral= Extent=[53:28 - 53:33] -// CHECK: 54:9: BinaryOperator= Extent=[54:9 - 54:33] -// CHECK: 54:9: BinaryOperator= Extent=[54:9 - 54:19] +// CHECK: 54:8: ParenExpr= Extent=[54:8 - 54:34] +// CHECK: 54:9: BinaryOperator=&& Extent=[54:9 - 54:33] +// CHECK: 54:9: BinaryOperator=>= Extent=[54:9 - 54:19] +// CHECK: 54:9: UnexposedExpr=c:2:14 Extent=[54:9 - 54:10] // CHECK: 54:9: DeclRefExpr=c:2:14 Extent=[54:9 - 54:10] // CHECK: 54:14: UnexposedExpr= Extent=[54:14 - 54:19] // CHECK: 54:14: IntegerLiteral= Extent=[54:14 - 54:19] -// CHECK: 54:23: BinaryOperator= Extent=[54:23 - 54:33] +// CHECK: 54:23: BinaryOperator=<= Extent=[54:23 - 54:33] +// CHECK: 54:23: UnexposedExpr=c:2:14 Extent=[54:23 - 54:24] // CHECK: 54:23: DeclRefExpr=c:2:14 Extent=[54:23 - 54:24] // CHECK: 54:28: UnexposedExpr= Extent=[54:28 - 54:33] // CHECK: 54:28: IntegerLiteral= Extent=[54:28 - 54:33] -// CHECK: 55:9: BinaryOperator= Extent=[55:9 - 55:33] -// CHECK: 55:9: BinaryOperator= Extent=[55:9 - 55:19] +// CHECK: 55:8: ParenExpr= Extent=[55:8 - 55:34] +// CHECK: 55:9: BinaryOperator=&& Extent=[55:9 - 55:33] +// CHECK: 55:9: BinaryOperator=>= Extent=[55:9 - 55:19] +// CHECK: 55:9: UnexposedExpr=c:2:14 Extent=[55:9 - 55:10] // CHECK: 55:9: DeclRefExpr=c:2:14 Extent=[55:9 - 55:10] // CHECK: 55:14: UnexposedExpr= Extent=[55:14 - 55:19] // CHECK: 55:14: IntegerLiteral= Extent=[55:14 - 55:19] -// CHECK: 55:23: BinaryOperator= Extent=[55:23 - 55:33] +// CHECK: 55:23: BinaryOperator=<= Extent=[55:23 - 55:33] +// CHECK: 55:23: UnexposedExpr=c:2:14 Extent=[55:23 - 55:24] // CHECK: 55:23: DeclRefExpr=c:2:14 Extent=[55:23 - 55:24] // CHECK: 55:28: UnexposedExpr= Extent=[55:28 - 55:33] // CHECK: 55:28: IntegerLiteral= Extent=[55:28 - 55:33] -// CHECK: 56:9: BinaryOperator= Extent=[56:9 - 56:33] -// CHECK: 56:9: BinaryOperator= Extent=[56:9 - 56:19] +// CHECK: 56:8: ParenExpr= Extent=[56:8 - 56:34] +// CHECK: 56:9: BinaryOperator=&& Extent=[56:9 - 56:33] +// CHECK: 56:9: BinaryOperator=>= Extent=[56:9 - 56:19] +// CHECK: 56:9: UnexposedExpr=c:2:14 Extent=[56:9 - 56:10] // CHECK: 56:9: DeclRefExpr=c:2:14 Extent=[56:9 - 56:10] // CHECK: 56:14: UnexposedExpr= Extent=[56:14 - 56:19] // CHECK: 56:14: IntegerLiteral= Extent=[56:14 - 56:19] -// CHECK: 56:23: BinaryOperator= Extent=[56:23 - 56:33] +// CHECK: 56:23: BinaryOperator=<= Extent=[56:23 - 56:33] +// CHECK: 56:23: UnexposedExpr=c:2:14 Extent=[56:23 - 56:24] // CHECK: 56:23: DeclRefExpr=c:2:14 Extent=[56:23 - 56:24] // CHECK: 56:28: UnexposedExpr= Extent=[56:28 - 56:33] // CHECK: 56:28: IntegerLiteral= Extent=[56:28 - 56:33] -// CHECK: 57:9: BinaryOperator= Extent=[57:9 - 57:33] -// CHECK: 57:9: BinaryOperator= Extent=[57:9 - 57:19] +// CHECK: 57:8: ParenExpr= Extent=[57:8 - 57:34] +// CHECK: 57:9: BinaryOperator=&& Extent=[57:9 - 57:33] +// CHECK: 57:9: BinaryOperator=>= Extent=[57:9 - 57:19] +// CHECK: 57:9: UnexposedExpr=c:2:14 Extent=[57:9 - 57:10] // CHECK: 57:9: DeclRefExpr=c:2:14 Extent=[57:9 - 57:10] // CHECK: 57:14: UnexposedExpr= Extent=[57:14 - 57:19] // CHECK: 57:14: IntegerLiteral= Extent=[57:14 - 57:19] -// CHECK: 57:23: BinaryOperator= Extent=[57:23 - 57:33] +// CHECK: 57:23: BinaryOperator=<= Extent=[57:23 - 57:33] +// CHECK: 57:23: UnexposedExpr=c:2:14 Extent=[57:23 - 57:24] // CHECK: 57:23: DeclRefExpr=c:2:14 Extent=[57:23 - 57:24] // CHECK: 57:28: UnexposedExpr= Extent=[57:28 - 57:33] // CHECK: 57:28: IntegerLiteral= Extent=[57:28 - 57:33] -// CHECK: 58:9: BinaryOperator= Extent=[58:9 - 58:33] -// CHECK: 58:9: BinaryOperator= Extent=[58:9 - 58:19] +// CHECK: 58:8: ParenExpr= Extent=[58:8 - 58:34] +// CHECK: 58:9: BinaryOperator=&& Extent=[58:9 - 58:33] +// CHECK: 58:9: BinaryOperator=>= Extent=[58:9 - 58:19] +// CHECK: 58:9: UnexposedExpr=c:2:14 Extent=[58:9 - 58:10] // CHECK: 58:9: DeclRefExpr=c:2:14 Extent=[58:9 - 58:10] // CHECK: 58:14: UnexposedExpr= Extent=[58:14 - 58:19] // CHECK: 58:14: IntegerLiteral= Extent=[58:14 - 58:19] -// CHECK: 58:23: BinaryOperator= Extent=[58:23 - 58:33] +// CHECK: 58:23: BinaryOperator=<= Extent=[58:23 - 58:33] +// CHECK: 58:23: UnexposedExpr=c:2:14 Extent=[58:23 - 58:24] // CHECK: 58:23: DeclRefExpr=c:2:14 Extent=[58:23 - 58:24] // CHECK: 58:28: UnexposedExpr= Extent=[58:28 - 58:33] // CHECK: 58:28: IntegerLiteral= Extent=[58:28 - 58:33] -// CHECK: 59:9: BinaryOperator= Extent=[59:9 - 59:33] -// CHECK: 59:9: BinaryOperator= Extent=[59:9 - 59:19] +// CHECK: 59:8: ParenExpr= Extent=[59:8 - 59:34] +// CHECK: 59:9: BinaryOperator=&& Extent=[59:9 - 59:33] +// CHECK: 59:9: BinaryOperator=>= Extent=[59:9 - 59:19] +// CHECK: 59:9: UnexposedExpr=c:2:14 Extent=[59:9 - 59:10] // CHECK: 59:9: DeclRefExpr=c:2:14 Extent=[59:9 - 59:10] // CHECK: 59:14: UnexposedExpr= Extent=[59:14 - 59:19] // CHECK: 59:14: IntegerLiteral= Extent=[59:14 - 59:19] -// CHECK: 59:23: BinaryOperator= Extent=[59:23 - 59:33] +// CHECK: 59:23: BinaryOperator=<= Extent=[59:23 - 59:33] +// CHECK: 59:23: UnexposedExpr=c:2:14 Extent=[59:23 - 59:24] // CHECK: 59:23: DeclRefExpr=c:2:14 Extent=[59:23 - 59:24] // CHECK: 59:28: UnexposedExpr= Extent=[59:28 - 59:33] // CHECK: 59:28: IntegerLiteral= Extent=[59:28 - 59:33] -// CHECK: 60:9: BinaryOperator= Extent=[60:9 - 60:33] -// CHECK: 60:9: BinaryOperator= Extent=[60:9 - 60:19] +// CHECK: 60:8: ParenExpr= Extent=[60:8 - 60:34] +// CHECK: 60:9: BinaryOperator=&& Extent=[60:9 - 60:33] +// CHECK: 60:9: BinaryOperator=>= Extent=[60:9 - 60:19] +// CHECK: 60:9: UnexposedExpr=c:2:14 Extent=[60:9 - 60:10] // CHECK: 60:9: DeclRefExpr=c:2:14 Extent=[60:9 - 60:10] // CHECK: 60:14: UnexposedExpr= Extent=[60:14 - 60:19] // CHECK: 60:14: IntegerLiteral= Extent=[60:14 - 60:19] -// CHECK: 60:23: BinaryOperator= Extent=[60:23 - 60:33] +// CHECK: 60:23: BinaryOperator=<= Extent=[60:23 - 60:33] +// CHECK: 60:23: UnexposedExpr=c:2:14 Extent=[60:23 - 60:24] // CHECK: 60:23: DeclRefExpr=c:2:14 Extent=[60:23 - 60:24] // CHECK: 60:28: UnexposedExpr= Extent=[60:28 - 60:33] // CHECK: 60:28: IntegerLiteral= Extent=[60:28 - 60:33] -// CHECK: 61:9: BinaryOperator= Extent=[61:9 - 61:33] -// CHECK: 61:9: BinaryOperator= Extent=[61:9 - 61:19] +// CHECK: 61:8: ParenExpr= Extent=[61:8 - 61:34] +// CHECK: 61:9: BinaryOperator=&& Extent=[61:9 - 61:33] +// CHECK: 61:9: BinaryOperator=>= Extent=[61:9 - 61:19] +// CHECK: 61:9: UnexposedExpr=c:2:14 Extent=[61:9 - 61:10] // CHECK: 61:9: DeclRefExpr=c:2:14 Extent=[61:9 - 61:10] // CHECK: 61:14: UnexposedExpr= Extent=[61:14 - 61:19] // CHECK: 61:14: IntegerLiteral= Extent=[61:14 - 61:19] -// CHECK: 61:23: BinaryOperator= Extent=[61:23 - 61:33] +// CHECK: 61:23: BinaryOperator=<= Extent=[61:23 - 61:33] +// CHECK: 61:23: UnexposedExpr=c:2:14 Extent=[61:23 - 61:24] // CHECK: 61:23: DeclRefExpr=c:2:14 Extent=[61:23 - 61:24] // CHECK: 61:28: UnexposedExpr= Extent=[61:28 - 61:33] // CHECK: 61:28: IntegerLiteral= Extent=[61:28 - 61:33] -// CHECK: 62:9: BinaryOperator= Extent=[62:9 - 62:33] -// CHECK: 62:9: BinaryOperator= Extent=[62:9 - 62:19] +// CHECK: 62:8: ParenExpr= Extent=[62:8 - 62:34] +// CHECK: 62:9: BinaryOperator=&& Extent=[62:9 - 62:33] +// CHECK: 62:9: BinaryOperator=>= Extent=[62:9 - 62:19] +// CHECK: 62:9: UnexposedExpr=c:2:14 Extent=[62:9 - 62:10] // CHECK: 62:9: DeclRefExpr=c:2:14 Extent=[62:9 - 62:10] // CHECK: 62:14: UnexposedExpr= Extent=[62:14 - 62:19] // CHECK: 62:14: IntegerLiteral= Extent=[62:14 - 62:19] -// CHECK: 62:23: BinaryOperator= Extent=[62:23 - 62:33] +// CHECK: 62:23: BinaryOperator=<= Extent=[62:23 - 62:33] +// CHECK: 62:23: UnexposedExpr=c:2:14 Extent=[62:23 - 62:24] // CHECK: 62:23: DeclRefExpr=c:2:14 Extent=[62:23 - 62:24] // CHECK: 62:28: UnexposedExpr= Extent=[62:28 - 62:33] // CHECK: 62:28: IntegerLiteral= Extent=[62:28 - 62:33] -// CHECK: 63:8: BinaryOperator= Extent=[63:8 - 63:18] +// CHECK: 63:8: BinaryOperator=== Extent=[63:8 - 63:18] +// CHECK: 63:8: UnexposedExpr=c:2:14 Extent=[63:8 - 63:9] // CHECK: 63:8: DeclRefExpr=c:2:14 Extent=[63:8 - 63:9] // CHECK: 63:13: UnexposedExpr= Extent=[63:13 - 63:18] // CHECK: 63:13: IntegerLiteral= Extent=[63:13 - 63:18] -// CHECK: 63:23: BinaryOperator= Extent=[63:23 - 63:47] -// CHECK: 63:23: BinaryOperator= Extent=[63:23 - 63:33] +// CHECK: 63:22: ParenExpr= Extent=[63:22 - 63:48] +// CHECK: 63:23: BinaryOperator=&& Extent=[63:23 - 63:47] +// CHECK: 63:23: BinaryOperator=>= Extent=[63:23 - 63:33] +// CHECK: 63:23: UnexposedExpr=c:2:14 Extent=[63:23 - 63:24] // CHECK: 63:23: DeclRefExpr=c:2:14 Extent=[63:23 - 63:24] // CHECK: 63:28: UnexposedExpr= Extent=[63:28 - 63:33] // CHECK: 63:28: IntegerLiteral= Extent=[63:28 - 63:33] -// CHECK: 63:37: BinaryOperator= Extent=[63:37 - 63:47] +// CHECK: 63:37: BinaryOperator=<= Extent=[63:37 - 63:47] +// CHECK: 63:37: UnexposedExpr=c:2:14 Extent=[63:37 - 63:38] // CHECK: 63:37: DeclRefExpr=c:2:14 Extent=[63:37 - 63:38] // CHECK: 63:42: UnexposedExpr= Extent=[63:42 - 63:47] // CHECK: 63:42: IntegerLiteral= Extent=[63:42 - 63:47] -// CHECK: 64:9: BinaryOperator= Extent=[64:9 - 64:33] -// CHECK: 64:9: BinaryOperator= Extent=[64:9 - 64:19] +// CHECK: 64:8: ParenExpr= Extent=[64:8 - 64:34] +// CHECK: 64:9: BinaryOperator=&& Extent=[64:9 - 64:33] +// CHECK: 64:9: BinaryOperator=>= Extent=[64:9 - 64:19] +// CHECK: 64:9: UnexposedExpr=c:2:14 Extent=[64:9 - 64:10] // CHECK: 64:9: DeclRefExpr=c:2:14 Extent=[64:9 - 64:10] // CHECK: 64:14: UnexposedExpr= Extent=[64:14 - 64:19] // CHECK: 64:14: IntegerLiteral= Extent=[64:14 - 64:19] -// CHECK: 64:23: BinaryOperator= Extent=[64:23 - 64:33] +// CHECK: 64:23: BinaryOperator=<= Extent=[64:23 - 64:33] +// CHECK: 64:23: UnexposedExpr=c:2:14 Extent=[64:23 - 64:24] // CHECK: 64:23: DeclRefExpr=c:2:14 Extent=[64:23 - 64:24] // CHECK: 64:28: UnexposedExpr= Extent=[64:28 - 64:33] // CHECK: 64:28: IntegerLiteral= Extent=[64:28 - 64:33] -// CHECK: 65:8: BinaryOperator= Extent=[65:8 - 65:18] +// CHECK: 65:8: BinaryOperator=== Extent=[65:8 - 65:18] +// CHECK: 65:8: UnexposedExpr=c:2:14 Extent=[65:8 - 65:9] // CHECK: 65:8: DeclRefExpr=c:2:14 Extent=[65:8 - 65:9] // CHECK: 65:13: UnexposedExpr= Extent=[65:13 - 65:18] // CHECK: 65:13: IntegerLiteral= Extent=[65:13 - 65:18] -// CHECK: 65:23: BinaryOperator= Extent=[65:23 - 65:47] -// CHECK: 65:23: BinaryOperator= Extent=[65:23 - 65:33] +// CHECK: 65:22: ParenExpr= Extent=[65:22 - 65:48] +// CHECK: 65:23: BinaryOperator=&& Extent=[65:23 - 65:47] +// CHECK: 65:23: BinaryOperator=>= Extent=[65:23 - 65:33] +// CHECK: 65:23: UnexposedExpr=c:2:14 Extent=[65:23 - 65:24] // CHECK: 65:23: DeclRefExpr=c:2:14 Extent=[65:23 - 65:24] // CHECK: 65:28: UnexposedExpr= Extent=[65:28 - 65:33] // CHECK: 65:28: IntegerLiteral= Extent=[65:28 - 65:33] -// CHECK: 65:37: BinaryOperator= Extent=[65:37 - 65:47] +// CHECK: 65:37: BinaryOperator=<= Extent=[65:37 - 65:47] +// CHECK: 65:37: UnexposedExpr=c:2:14 Extent=[65:37 - 65:38] // CHECK: 65:37: DeclRefExpr=c:2:14 Extent=[65:37 - 65:38] // CHECK: 65:42: UnexposedExpr= Extent=[65:42 - 65:47] // CHECK: 65:42: IntegerLiteral= Extent=[65:42 - 65:47] -// CHECK: 66:9: BinaryOperator= Extent=[66:9 - 66:33] -// CHECK: 66:9: BinaryOperator= Extent=[66:9 - 66:19] +// CHECK: 66:8: ParenExpr= Extent=[66:8 - 66:34] +// CHECK: 66:9: BinaryOperator=&& Extent=[66:9 - 66:33] +// CHECK: 66:9: BinaryOperator=>= Extent=[66:9 - 66:19] +// CHECK: 66:9: UnexposedExpr=c:2:14 Extent=[66:9 - 66:10] // CHECK: 66:9: DeclRefExpr=c:2:14 Extent=[66:9 - 66:10] // CHECK: 66:14: UnexposedExpr= Extent=[66:14 - 66:19] // CHECK: 66:14: IntegerLiteral= Extent=[66:14 - 66:19] -// CHECK: 66:23: BinaryOperator= Extent=[66:23 - 66:33] +// CHECK: 66:23: BinaryOperator=<= Extent=[66:23 - 66:33] +// CHECK: 66:23: UnexposedExpr=c:2:14 Extent=[66:23 - 66:24] // CHECK: 66:23: DeclRefExpr=c:2:14 Extent=[66:23 - 66:24] // CHECK: 66:28: UnexposedExpr= Extent=[66:28 - 66:33] // CHECK: 66:28: IntegerLiteral= Extent=[66:28 - 66:33] -// CHECK: 67:9: BinaryOperator= Extent=[67:9 - 67:33] -// CHECK: 67:9: BinaryOperator= Extent=[67:9 - 67:19] +// CHECK: 67:8: ParenExpr= Extent=[67:8 - 67:34] +// CHECK: 67:9: BinaryOperator=&& Extent=[67:9 - 67:33] +// CHECK: 67:9: BinaryOperator=>= Extent=[67:9 - 67:19] +// CHECK: 67:9: UnexposedExpr=c:2:14 Extent=[67:9 - 67:10] // CHECK: 67:9: DeclRefExpr=c:2:14 Extent=[67:9 - 67:10] // CHECK: 67:14: UnexposedExpr= Extent=[67:14 - 67:19] // CHECK: 67:14: IntegerLiteral= Extent=[67:14 - 67:19] -// CHECK: 67:23: BinaryOperator= Extent=[67:23 - 67:33] +// CHECK: 67:23: BinaryOperator=<= Extent=[67:23 - 67:33] +// CHECK: 67:23: UnexposedExpr=c:2:14 Extent=[67:23 - 67:24] // CHECK: 67:23: DeclRefExpr=c:2:14 Extent=[67:23 - 67:24] // CHECK: 67:28: UnexposedExpr= Extent=[67:28 - 67:33] // CHECK: 67:28: IntegerLiteral= Extent=[67:28 - 67:33] -// CHECK: 68:9: BinaryOperator= Extent=[68:9 - 68:33] -// CHECK: 68:9: BinaryOperator= Extent=[68:9 - 68:19] +// CHECK: 68:8: ParenExpr= Extent=[68:8 - 68:34] +// CHECK: 68:9: BinaryOperator=&& Extent=[68:9 - 68:33] +// CHECK: 68:9: BinaryOperator=>= Extent=[68:9 - 68:19] +// CHECK: 68:9: UnexposedExpr=c:2:14 Extent=[68:9 - 68:10] // CHECK: 68:9: DeclRefExpr=c:2:14 Extent=[68:9 - 68:10] // CHECK: 68:14: UnexposedExpr= Extent=[68:14 - 68:19] // CHECK: 68:14: IntegerLiteral= Extent=[68:14 - 68:19] -// CHECK: 68:23: BinaryOperator= Extent=[68:23 - 68:33] +// CHECK: 68:23: BinaryOperator=<= Extent=[68:23 - 68:33] +// CHECK: 68:23: UnexposedExpr=c:2:14 Extent=[68:23 - 68:24] // CHECK: 68:23: DeclRefExpr=c:2:14 Extent=[68:23 - 68:24] // CHECK: 68:28: UnexposedExpr= Extent=[68:28 - 68:33] // CHECK: 68:28: IntegerLiteral= Extent=[68:28 - 68:33] -// CHECK: 69:9: BinaryOperator= Extent=[69:9 - 69:33] -// CHECK: 69:9: BinaryOperator= Extent=[69:9 - 69:19] +// CHECK: 69:8: ParenExpr= Extent=[69:8 - 69:34] +// CHECK: 69:9: BinaryOperator=&& Extent=[69:9 - 69:33] +// CHECK: 69:9: BinaryOperator=>= Extent=[69:9 - 69:19] +// CHECK: 69:9: UnexposedExpr=c:2:14 Extent=[69:9 - 69:10] // CHECK: 69:9: DeclRefExpr=c:2:14 Extent=[69:9 - 69:10] // CHECK: 69:14: UnexposedExpr= Extent=[69:14 - 69:19] // CHECK: 69:14: IntegerLiteral= Extent=[69:14 - 69:19] -// CHECK: 69:23: BinaryOperator= Extent=[69:23 - 69:33] +// CHECK: 69:23: BinaryOperator=<= Extent=[69:23 - 69:33] +// CHECK: 69:23: UnexposedExpr=c:2:14 Extent=[69:23 - 69:24] // CHECK: 69:23: DeclRefExpr=c:2:14 Extent=[69:23 - 69:24] // CHECK: 69:28: UnexposedExpr= Extent=[69:28 - 69:33] // CHECK: 69:28: IntegerLiteral= Extent=[69:28 - 69:33] -// CHECK: 70:8: BinaryOperator= Extent=[70:8 - 70:18] +// CHECK: 70:8: BinaryOperator=== Extent=[70:8 - 70:18] +// CHECK: 70:8: UnexposedExpr=c:2:14 Extent=[70:8 - 70:9] // CHECK: 70:8: DeclRefExpr=c:2:14 Extent=[70:8 - 70:9] // CHECK: 70:13: UnexposedExpr= Extent=[70:13 - 70:18] // CHECK: 70:13: IntegerLiteral= Extent=[70:13 - 70:18] -// CHECK: 70:22: BinaryOperator= Extent=[70:22 - 70:32] +// CHECK: 70:22: BinaryOperator=== Extent=[70:22 - 70:32] +// CHECK: 70:22: UnexposedExpr=c:2:14 Extent=[70:22 - 70:23] // CHECK: 70:22: DeclRefExpr=c:2:14 Extent=[70:22 - 70:23] // CHECK: 70:27: UnexposedExpr= Extent=[70:27 - 70:32] // CHECK: 70:27: IntegerLiteral= Extent=[70:27 - 70:32] -// CHECK: 70:37: BinaryOperator= Extent=[70:37 - 70:61] -// CHECK: 70:37: BinaryOperator= Extent=[70:37 - 70:47] +// CHECK: 70:36: ParenExpr= Extent=[70:36 - 70:62] +// CHECK: 70:37: BinaryOperator=&& Extent=[70:37 - 70:61] +// CHECK: 70:37: BinaryOperator=>= Extent=[70:37 - 70:47] +// CHECK: 70:37: UnexposedExpr=c:2:14 Extent=[70:37 - 70:38] // CHECK: 70:37: DeclRefExpr=c:2:14 Extent=[70:37 - 70:38] // CHECK: 70:42: UnexposedExpr= Extent=[70:42 - 70:47] // CHECK: 70:42: IntegerLiteral= Extent=[70:42 - 70:47] -// CHECK: 70:51: BinaryOperator= Extent=[70:51 - 70:61] +// CHECK: 70:51: BinaryOperator=<= Extent=[70:51 - 70:61] +// CHECK: 70:51: UnexposedExpr=c:2:14 Extent=[70:51 - 70:52] // CHECK: 70:51: DeclRefExpr=c:2:14 Extent=[70:51 - 70:52] // CHECK: 70:56: UnexposedExpr= Extent=[70:56 - 70:61] // CHECK: 70:56: IntegerLiteral= Extent=[70:56 - 70:61] -// CHECK: 71:9: BinaryOperator= Extent=[71:9 - 71:33] -// CHECK: 71:9: BinaryOperator= Extent=[71:9 - 71:19] +// CHECK: 71:8: ParenExpr= Extent=[71:8 - 71:34] +// CHECK: 71:9: BinaryOperator=&& Extent=[71:9 - 71:33] +// CHECK: 71:9: BinaryOperator=>= Extent=[71:9 - 71:19] +// CHECK: 71:9: UnexposedExpr=c:2:14 Extent=[71:9 - 71:10] // CHECK: 71:9: DeclRefExpr=c:2:14 Extent=[71:9 - 71:10] // CHECK: 71:14: UnexposedExpr= Extent=[71:14 - 71:19] // CHECK: 71:14: IntegerLiteral= Extent=[71:14 - 71:19] -// CHECK: 71:23: BinaryOperator= Extent=[71:23 - 71:33] +// CHECK: 71:23: BinaryOperator=<= Extent=[71:23 - 71:33] +// CHECK: 71:23: UnexposedExpr=c:2:14 Extent=[71:23 - 71:24] // CHECK: 71:23: DeclRefExpr=c:2:14 Extent=[71:23 - 71:24] // CHECK: 71:28: UnexposedExpr= Extent=[71:28 - 71:33] // CHECK: 71:28: IntegerLiteral= Extent=[71:28 - 71:33] -// CHECK: 72:9: BinaryOperator= Extent=[72:9 - 72:33] -// CHECK: 72:9: BinaryOperator= Extent=[72:9 - 72:19] +// CHECK: 72:8: ParenExpr= Extent=[72:8 - 72:34] +// CHECK: 72:9: BinaryOperator=&& Extent=[72:9 - 72:33] +// CHECK: 72:9: BinaryOperator=>= Extent=[72:9 - 72:19] +// CHECK: 72:9: UnexposedExpr=c:2:14 Extent=[72:9 - 72:10] // CHECK: 72:9: DeclRefExpr=c:2:14 Extent=[72:9 - 72:10] // CHECK: 72:14: UnexposedExpr= Extent=[72:14 - 72:19] // CHECK: 72:14: IntegerLiteral= Extent=[72:14 - 72:19] -// CHECK: 72:23: BinaryOperator= Extent=[72:23 - 72:33] +// CHECK: 72:23: BinaryOperator=<= Extent=[72:23 - 72:33] +// CHECK: 72:23: UnexposedExpr=c:2:14 Extent=[72:23 - 72:24] // CHECK: 72:23: DeclRefExpr=c:2:14 Extent=[72:23 - 72:24] // CHECK: 72:28: UnexposedExpr= Extent=[72:28 - 72:33] // CHECK: 72:28: IntegerLiteral= Extent=[72:28 - 72:33] -// CHECK: 73:9: BinaryOperator= Extent=[73:9 - 73:33] -// CHECK: 73:9: BinaryOperator= Extent=[73:9 - 73:19] +// CHECK: 73:8: ParenExpr= Extent=[73:8 - 73:34] +// CHECK: 73:9: BinaryOperator=&& Extent=[73:9 - 73:33] +// CHECK: 73:9: BinaryOperator=>= Extent=[73:9 - 73:19] +// CHECK: 73:9: UnexposedExpr=c:2:14 Extent=[73:9 - 73:10] // CHECK: 73:9: DeclRefExpr=c:2:14 Extent=[73:9 - 73:10] // CHECK: 73:14: UnexposedExpr= Extent=[73:14 - 73:19] // CHECK: 73:14: IntegerLiteral= Extent=[73:14 - 73:19] -// CHECK: 73:23: BinaryOperator= Extent=[73:23 - 73:33] +// CHECK: 73:23: BinaryOperator=<= Extent=[73:23 - 73:33] +// CHECK: 73:23: UnexposedExpr=c:2:14 Extent=[73:23 - 73:24] // CHECK: 73:23: DeclRefExpr=c:2:14 Extent=[73:23 - 73:24] // CHECK: 73:28: UnexposedExpr= Extent=[73:28 - 73:33] // CHECK: 73:28: IntegerLiteral= Extent=[73:28 - 73:33] -// CHECK: 74:9: BinaryOperator= Extent=[74:9 - 74:33] -// CHECK: 74:9: BinaryOperator= Extent=[74:9 - 74:19] +// CHECK: 74:8: ParenExpr= Extent=[74:8 - 74:34] +// CHECK: 74:9: BinaryOperator=&& Extent=[74:9 - 74:33] +// CHECK: 74:9: BinaryOperator=>= Extent=[74:9 - 74:19] +// CHECK: 74:9: UnexposedExpr=c:2:14 Extent=[74:9 - 74:10] // CHECK: 74:9: DeclRefExpr=c:2:14 Extent=[74:9 - 74:10] // CHECK: 74:14: UnexposedExpr= Extent=[74:14 - 74:19] // CHECK: 74:14: IntegerLiteral= Extent=[74:14 - 74:19] -// CHECK: 74:23: BinaryOperator= Extent=[74:23 - 74:33] +// CHECK: 74:23: BinaryOperator=<= Extent=[74:23 - 74:33] +// CHECK: 74:23: UnexposedExpr=c:2:14 Extent=[74:23 - 74:24] // CHECK: 74:23: DeclRefExpr=c:2:14 Extent=[74:23 - 74:24] // CHECK: 74:28: UnexposedExpr= Extent=[74:28 - 74:33] // CHECK: 74:28: IntegerLiteral= Extent=[74:28 - 74:33] -// CHECK: 75:9: BinaryOperator= Extent=[75:9 - 75:33] -// CHECK: 75:9: BinaryOperator= Extent=[75:9 - 75:19] +// CHECK: 75:8: ParenExpr= Extent=[75:8 - 75:34] +// CHECK: 75:9: BinaryOperator=&& Extent=[75:9 - 75:33] +// CHECK: 75:9: BinaryOperator=>= Extent=[75:9 - 75:19] +// CHECK: 75:9: UnexposedExpr=c:2:14 Extent=[75:9 - 75:10] // CHECK: 75:9: DeclRefExpr=c:2:14 Extent=[75:9 - 75:10] // CHECK: 75:14: UnexposedExpr= Extent=[75:14 - 75:19] // CHECK: 75:14: IntegerLiteral= Extent=[75:14 - 75:19] -// CHECK: 75:23: BinaryOperator= Extent=[75:23 - 75:33] +// CHECK: 75:23: BinaryOperator=<= Extent=[75:23 - 75:33] +// CHECK: 75:23: UnexposedExpr=c:2:14 Extent=[75:23 - 75:24] // CHECK: 75:23: DeclRefExpr=c:2:14 Extent=[75:23 - 75:24] // CHECK: 75:28: UnexposedExpr= Extent=[75:28 - 75:33] // CHECK: 75:28: IntegerLiteral= Extent=[75:28 - 75:33] -// CHECK: 76:8: BinaryOperator= Extent=[76:8 - 76:18] +// CHECK: 76:8: BinaryOperator=== Extent=[76:8 - 76:18] +// CHECK: 76:8: UnexposedExpr=c:2:14 Extent=[76:8 - 76:9] // CHECK: 76:8: DeclRefExpr=c:2:14 Extent=[76:8 - 76:9] // CHECK: 76:13: UnexposedExpr= Extent=[76:13 - 76:18] // CHECK: 76:13: IntegerLiteral= Extent=[76:13 - 76:18] -// CHECK: 76:23: BinaryOperator= Extent=[76:23 - 76:47] -// CHECK: 76:23: BinaryOperator= Extent=[76:23 - 76:33] +// CHECK: 76:22: ParenExpr= Extent=[76:22 - 76:48] +// CHECK: 76:23: BinaryOperator=&& Extent=[76:23 - 76:47] +// CHECK: 76:23: BinaryOperator=>= Extent=[76:23 - 76:33] +// CHECK: 76:23: UnexposedExpr=c:2:14 Extent=[76:23 - 76:24] // CHECK: 76:23: DeclRefExpr=c:2:14 Extent=[76:23 - 76:24] // CHECK: 76:28: UnexposedExpr= Extent=[76:28 - 76:33] // CHECK: 76:28: IntegerLiteral= Extent=[76:28 - 76:33] -// CHECK: 76:37: BinaryOperator= Extent=[76:37 - 76:47] +// CHECK: 76:37: BinaryOperator=<= Extent=[76:37 - 76:47] +// CHECK: 76:37: UnexposedExpr=c:2:14 Extent=[76:37 - 76:38] // CHECK: 76:37: DeclRefExpr=c:2:14 Extent=[76:37 - 76:38] // CHECK: 76:42: UnexposedExpr= Extent=[76:42 - 76:47] // CHECK: 76:42: IntegerLiteral= Extent=[76:42 - 76:47] -// CHECK: 77:9: BinaryOperator= Extent=[77:9 - 77:33] -// CHECK: 77:9: BinaryOperator= Extent=[77:9 - 77:19] +// CHECK: 77:8: ParenExpr= Extent=[77:8 - 77:34] +// CHECK: 77:9: BinaryOperator=&& Extent=[77:9 - 77:33] +// CHECK: 77:9: BinaryOperator=>= Extent=[77:9 - 77:19] +// CHECK: 77:9: UnexposedExpr=c:2:14 Extent=[77:9 - 77:10] // CHECK: 77:9: DeclRefExpr=c:2:14 Extent=[77:9 - 77:10] // CHECK: 77:14: UnexposedExpr= Extent=[77:14 - 77:19] // CHECK: 77:14: IntegerLiteral= Extent=[77:14 - 77:19] -// CHECK: 77:23: BinaryOperator= Extent=[77:23 - 77:33] +// CHECK: 77:23: BinaryOperator=<= Extent=[77:23 - 77:33] +// CHECK: 77:23: UnexposedExpr=c:2:14 Extent=[77:23 - 77:24] // CHECK: 77:23: DeclRefExpr=c:2:14 Extent=[77:23 - 77:24] // CHECK: 77:28: UnexposedExpr= Extent=[77:28 - 77:33] // CHECK: 77:28: IntegerLiteral= Extent=[77:28 - 77:33] -// CHECK: 78:9: BinaryOperator= Extent=[78:9 - 78:33] -// CHECK: 78:9: BinaryOperator= Extent=[78:9 - 78:19] +// CHECK: 78:8: ParenExpr= Extent=[78:8 - 78:34] +// CHECK: 78:9: BinaryOperator=&& Extent=[78:9 - 78:33] +// CHECK: 78:9: BinaryOperator=>= Extent=[78:9 - 78:19] +// CHECK: 78:9: UnexposedExpr=c:2:14 Extent=[78:9 - 78:10] // CHECK: 78:9: DeclRefExpr=c:2:14 Extent=[78:9 - 78:10] // CHECK: 78:14: UnexposedExpr= Extent=[78:14 - 78:19] // CHECK: 78:14: IntegerLiteral= Extent=[78:14 - 78:19] -// CHECK: 78:23: BinaryOperator= Extent=[78:23 - 78:33] +// CHECK: 78:23: BinaryOperator=<= Extent=[78:23 - 78:33] +// CHECK: 78:23: UnexposedExpr=c:2:14 Extent=[78:23 - 78:24] // CHECK: 78:23: DeclRefExpr=c:2:14 Extent=[78:23 - 78:24] // CHECK: 78:28: UnexposedExpr= Extent=[78:28 - 78:33] // CHECK: 78:28: IntegerLiteral= Extent=[78:28 - 78:33] -// CHECK: 79:9: BinaryOperator= Extent=[79:9 - 79:33] -// CHECK: 79:9: BinaryOperator= Extent=[79:9 - 79:19] +// CHECK: 79:8: ParenExpr= Extent=[79:8 - 79:34] +// CHECK: 79:9: BinaryOperator=&& Extent=[79:9 - 79:33] +// CHECK: 79:9: BinaryOperator=>= Extent=[79:9 - 79:19] +// CHECK: 79:9: UnexposedExpr=c:2:14 Extent=[79:9 - 79:10] // CHECK: 79:9: DeclRefExpr=c:2:14 Extent=[79:9 - 79:10] // CHECK: 79:14: UnexposedExpr= Extent=[79:14 - 79:19] // CHECK: 79:14: IntegerLiteral= Extent=[79:14 - 79:19] -// CHECK: 79:23: BinaryOperator= Extent=[79:23 - 79:33] +// CHECK: 79:23: BinaryOperator=<= Extent=[79:23 - 79:33] +// CHECK: 79:23: UnexposedExpr=c:2:14 Extent=[79:23 - 79:24] // CHECK: 79:23: DeclRefExpr=c:2:14 Extent=[79:23 - 79:24] // CHECK: 79:28: UnexposedExpr= Extent=[79:28 - 79:33] // CHECK: 79:28: IntegerLiteral= Extent=[79:28 - 79:33] -// CHECK: 80:9: BinaryOperator= Extent=[80:9 - 80:33] -// CHECK: 80:9: BinaryOperator= Extent=[80:9 - 80:19] +// CHECK: 80:8: ParenExpr= Extent=[80:8 - 80:34] +// CHECK: 80:9: BinaryOperator=&& Extent=[80:9 - 80:33] +// CHECK: 80:9: BinaryOperator=>= Extent=[80:9 - 80:19] +// CHECK: 80:9: UnexposedExpr=c:2:14 Extent=[80:9 - 80:10] // CHECK: 80:9: DeclRefExpr=c:2:14 Extent=[80:9 - 80:10] // CHECK: 80:14: UnexposedExpr= Extent=[80:14 - 80:19] // CHECK: 80:14: IntegerLiteral= Extent=[80:14 - 80:19] -// CHECK: 80:23: BinaryOperator= Extent=[80:23 - 80:33] +// CHECK: 80:23: BinaryOperator=<= Extent=[80:23 - 80:33] +// CHECK: 80:23: UnexposedExpr=c:2:14 Extent=[80:23 - 80:24] // CHECK: 80:23: DeclRefExpr=c:2:14 Extent=[80:23 - 80:24] // CHECK: 80:28: UnexposedExpr= Extent=[80:28 - 80:33] // CHECK: 80:28: IntegerLiteral= Extent=[80:28 - 80:33] -// CHECK: 81:9: BinaryOperator= Extent=[81:9 - 81:33] -// CHECK: 81:9: BinaryOperator= Extent=[81:9 - 81:19] +// CHECK: 81:8: ParenExpr= Extent=[81:8 - 81:34] +// CHECK: 81:9: BinaryOperator=&& Extent=[81:9 - 81:33] +// CHECK: 81:9: BinaryOperator=>= Extent=[81:9 - 81:19] +// CHECK: 81:9: UnexposedExpr=c:2:14 Extent=[81:9 - 81:10] // CHECK: 81:9: DeclRefExpr=c:2:14 Extent=[81:9 - 81:10] // CHECK: 81:14: UnexposedExpr= Extent=[81:14 - 81:19] // CHECK: 81:14: IntegerLiteral= Extent=[81:14 - 81:19] -// CHECK: 81:23: BinaryOperator= Extent=[81:23 - 81:33] +// CHECK: 81:23: BinaryOperator=<= Extent=[81:23 - 81:33] +// CHECK: 81:23: UnexposedExpr=c:2:14 Extent=[81:23 - 81:24] // CHECK: 81:23: DeclRefExpr=c:2:14 Extent=[81:23 - 81:24] // CHECK: 81:28: UnexposedExpr= Extent=[81:28 - 81:33] // CHECK: 81:28: IntegerLiteral= Extent=[81:28 - 81:33] -// CHECK: 82:8: BinaryOperator= Extent=[82:8 - 82:18] +// CHECK: 82:8: BinaryOperator=== Extent=[82:8 - 82:18] +// CHECK: 82:8: UnexposedExpr=c:2:14 Extent=[82:8 - 82:9] // CHECK: 82:8: DeclRefExpr=c:2:14 Extent=[82:8 - 82:9] // CHECK: 82:13: UnexposedExpr= Extent=[82:13 - 82:18] // CHECK: 82:13: IntegerLiteral= Extent=[82:13 - 82:18] -// CHECK: 82:23: BinaryOperator= Extent=[82:23 - 82:47] -// CHECK: 82:23: BinaryOperator= Extent=[82:23 - 82:33] +// CHECK: 82:22: ParenExpr= Extent=[82:22 - 82:48] +// CHECK: 82:23: BinaryOperator=&& Extent=[82:23 - 82:47] +// CHECK: 82:23: BinaryOperator=>= Extent=[82:23 - 82:33] +// CHECK: 82:23: UnexposedExpr=c:2:14 Extent=[82:23 - 82:24] // CHECK: 82:23: DeclRefExpr=c:2:14 Extent=[82:23 - 82:24] // CHECK: 82:28: UnexposedExpr= Extent=[82:28 - 82:33] // CHECK: 82:28: IntegerLiteral= Extent=[82:28 - 82:33] -// CHECK: 82:37: BinaryOperator= Extent=[82:37 - 82:47] +// CHECK: 82:37: BinaryOperator=<= Extent=[82:37 - 82:47] +// CHECK: 82:37: UnexposedExpr=c:2:14 Extent=[82:37 - 82:38] // CHECK: 82:37: DeclRefExpr=c:2:14 Extent=[82:37 - 82:38] // CHECK: 82:42: UnexposedExpr= Extent=[82:42 - 82:47] // CHECK: 82:42: IntegerLiteral= Extent=[82:42 - 82:47] -// CHECK: 83:9: BinaryOperator= Extent=[83:9 - 83:33] -// CHECK: 83:9: BinaryOperator= Extent=[83:9 - 83:19] +// CHECK: 83:8: ParenExpr= Extent=[83:8 - 83:34] +// CHECK: 83:9: BinaryOperator=&& Extent=[83:9 - 83:33] +// CHECK: 83:9: BinaryOperator=>= Extent=[83:9 - 83:19] +// CHECK: 83:9: UnexposedExpr=c:2:14 Extent=[83:9 - 83:10] // CHECK: 83:9: DeclRefExpr=c:2:14 Extent=[83:9 - 83:10] // CHECK: 83:14: UnexposedExpr= Extent=[83:14 - 83:19] // CHECK: 83:14: IntegerLiteral= Extent=[83:14 - 83:19] -// CHECK: 83:23: BinaryOperator= Extent=[83:23 - 83:33] +// CHECK: 83:23: BinaryOperator=<= Extent=[83:23 - 83:33] +// CHECK: 83:23: UnexposedExpr=c:2:14 Extent=[83:23 - 83:24] // CHECK: 83:23: DeclRefExpr=c:2:14 Extent=[83:23 - 83:24] // CHECK: 83:28: UnexposedExpr= Extent=[83:28 - 83:33] // CHECK: 83:28: IntegerLiteral= Extent=[83:28 - 83:33] -// CHECK: 84:9: BinaryOperator= Extent=[84:9 - 84:33] -// CHECK: 84:9: BinaryOperator= Extent=[84:9 - 84:19] +// CHECK: 84:8: ParenExpr= Extent=[84:8 - 84:34] +// CHECK: 84:9: BinaryOperator=&& Extent=[84:9 - 84:33] +// CHECK: 84:9: BinaryOperator=>= Extent=[84:9 - 84:19] +// CHECK: 84:9: UnexposedExpr=c:2:14 Extent=[84:9 - 84:10] // CHECK: 84:9: DeclRefExpr=c:2:14 Extent=[84:9 - 84:10] // CHECK: 84:14: UnexposedExpr= Extent=[84:14 - 84:19] // CHECK: 84:14: IntegerLiteral= Extent=[84:14 - 84:19] -// CHECK: 84:23: BinaryOperator= Extent=[84:23 - 84:33] +// CHECK: 84:23: BinaryOperator=<= Extent=[84:23 - 84:33] +// CHECK: 84:23: UnexposedExpr=c:2:14 Extent=[84:23 - 84:24] // CHECK: 84:23: DeclRefExpr=c:2:14 Extent=[84:23 - 84:24] // CHECK: 84:28: UnexposedExpr= Extent=[84:28 - 84:33] // CHECK: 84:28: IntegerLiteral= Extent=[84:28 - 84:33] -// CHECK: 85:9: BinaryOperator= Extent=[85:9 - 85:33] -// CHECK: 85:9: BinaryOperator= Extent=[85:9 - 85:19] +// CHECK: 85:8: ParenExpr= Extent=[85:8 - 85:34] +// CHECK: 85:9: BinaryOperator=&& Extent=[85:9 - 85:33] +// CHECK: 85:9: BinaryOperator=>= Extent=[85:9 - 85:19] +// CHECK: 85:9: UnexposedExpr=c:2:14 Extent=[85:9 - 85:10] // CHECK: 85:9: DeclRefExpr=c:2:14 Extent=[85:9 - 85:10] // CHECK: 85:14: UnexposedExpr= Extent=[85:14 - 85:19] // CHECK: 85:14: IntegerLiteral= Extent=[85:14 - 85:19] -// CHECK: 85:23: BinaryOperator= Extent=[85:23 - 85:33] +// CHECK: 85:23: BinaryOperator=<= Extent=[85:23 - 85:33] +// CHECK: 85:23: UnexposedExpr=c:2:14 Extent=[85:23 - 85:24] // CHECK: 85:23: DeclRefExpr=c:2:14 Extent=[85:23 - 85:24] // CHECK: 85:28: UnexposedExpr= Extent=[85:28 - 85:33] // CHECK: 85:28: IntegerLiteral= Extent=[85:28 - 85:33] -// CHECK: 86:9: BinaryOperator= Extent=[86:9 - 86:33] -// CHECK: 86:9: BinaryOperator= Extent=[86:9 - 86:19] +// CHECK: 86:8: ParenExpr= Extent=[86:8 - 86:34] +// CHECK: 86:9: BinaryOperator=&& Extent=[86:9 - 86:33] +// CHECK: 86:9: BinaryOperator=>= Extent=[86:9 - 86:19] +// CHECK: 86:9: UnexposedExpr=c:2:14 Extent=[86:9 - 86:10] // CHECK: 86:9: DeclRefExpr=c:2:14 Extent=[86:9 - 86:10] // CHECK: 86:14: UnexposedExpr= Extent=[86:14 - 86:19] // CHECK: 86:14: IntegerLiteral= Extent=[86:14 - 86:19] -// CHECK: 86:23: BinaryOperator= Extent=[86:23 - 86:33] +// CHECK: 86:23: BinaryOperator=<= Extent=[86:23 - 86:33] +// CHECK: 86:23: UnexposedExpr=c:2:14 Extent=[86:23 - 86:24] // CHECK: 86:23: DeclRefExpr=c:2:14 Extent=[86:23 - 86:24] // CHECK: 86:28: UnexposedExpr= Extent=[86:28 - 86:33] // CHECK: 86:28: IntegerLiteral= Extent=[86:28 - 86:33] -// CHECK: 87:9: BinaryOperator= Extent=[87:9 - 87:33] -// CHECK: 87:9: BinaryOperator= Extent=[87:9 - 87:19] +// CHECK: 87:8: ParenExpr= Extent=[87:8 - 87:34] +// CHECK: 87:9: BinaryOperator=&& Extent=[87:9 - 87:33] +// CHECK: 87:9: BinaryOperator=>= Extent=[87:9 - 87:19] +// CHECK: 87:9: UnexposedExpr=c:2:14 Extent=[87:9 - 87:10] // CHECK: 87:9: DeclRefExpr=c:2:14 Extent=[87:9 - 87:10] // CHECK: 87:14: UnexposedExpr= Extent=[87:14 - 87:19] // CHECK: 87:14: IntegerLiteral= Extent=[87:14 - 87:19] -// CHECK: 87:23: BinaryOperator= Extent=[87:23 - 87:33] +// CHECK: 87:23: BinaryOperator=<= Extent=[87:23 - 87:33] +// CHECK: 87:23: UnexposedExpr=c:2:14 Extent=[87:23 - 87:24] // CHECK: 87:23: DeclRefExpr=c:2:14 Extent=[87:23 - 87:24] // CHECK: 87:28: UnexposedExpr= Extent=[87:28 - 87:33] // CHECK: 87:28: IntegerLiteral= Extent=[87:28 - 87:33] -// CHECK: 88:9: BinaryOperator= Extent=[88:9 - 88:33] -// CHECK: 88:9: BinaryOperator= Extent=[88:9 - 88:19] +// CHECK: 88:8: ParenExpr= Extent=[88:8 - 88:34] +// CHECK: 88:9: BinaryOperator=&& Extent=[88:9 - 88:33] +// CHECK: 88:9: BinaryOperator=>= Extent=[88:9 - 88:19] +// CHECK: 88:9: UnexposedExpr=c:2:14 Extent=[88:9 - 88:10] // CHECK: 88:9: DeclRefExpr=c:2:14 Extent=[88:9 - 88:10] // CHECK: 88:14: UnexposedExpr= Extent=[88:14 - 88:19] // CHECK: 88:14: IntegerLiteral= Extent=[88:14 - 88:19] -// CHECK: 88:23: BinaryOperator= Extent=[88:23 - 88:33] +// CHECK: 88:23: BinaryOperator=<= Extent=[88:23 - 88:33] +// CHECK: 88:23: UnexposedExpr=c:2:14 Extent=[88:23 - 88:24] // CHECK: 88:23: DeclRefExpr=c:2:14 Extent=[88:23 - 88:24] // CHECK: 88:28: UnexposedExpr= Extent=[88:28 - 88:33] // CHECK: 88:28: IntegerLiteral= Extent=[88:28 - 88:33] -// CHECK: 89:9: BinaryOperator= Extent=[89:9 - 89:33] -// CHECK: 89:9: BinaryOperator= Extent=[89:9 - 89:19] +// CHECK: 89:8: ParenExpr= Extent=[89:8 - 89:34] +// CHECK: 89:9: BinaryOperator=&& Extent=[89:9 - 89:33] +// CHECK: 89:9: BinaryOperator=>= Extent=[89:9 - 89:19] +// CHECK: 89:9: UnexposedExpr=c:2:14 Extent=[89:9 - 89:10] // CHECK: 89:9: DeclRefExpr=c:2:14 Extent=[89:9 - 89:10] // CHECK: 89:14: UnexposedExpr= Extent=[89:14 - 89:19] // CHECK: 89:14: IntegerLiteral= Extent=[89:14 - 89:19] -// CHECK: 89:23: BinaryOperator= Extent=[89:23 - 89:33] +// CHECK: 89:23: BinaryOperator=<= Extent=[89:23 - 89:33] +// CHECK: 89:23: UnexposedExpr=c:2:14 Extent=[89:23 - 89:24] // CHECK: 89:23: DeclRefExpr=c:2:14 Extent=[89:23 - 89:24] // CHECK: 89:28: UnexposedExpr= Extent=[89:28 - 89:33] // CHECK: 89:28: IntegerLiteral= Extent=[89:28 - 89:33] -// CHECK: 90:9: BinaryOperator= Extent=[90:9 - 90:33] -// CHECK: 90:9: BinaryOperator= Extent=[90:9 - 90:19] +// CHECK: 90:8: ParenExpr= Extent=[90:8 - 90:34] +// CHECK: 90:9: BinaryOperator=&& Extent=[90:9 - 90:33] +// CHECK: 90:9: BinaryOperator=>= Extent=[90:9 - 90:19] +// CHECK: 90:9: UnexposedExpr=c:2:14 Extent=[90:9 - 90:10] // CHECK: 90:9: DeclRefExpr=c:2:14 Extent=[90:9 - 90:10] // CHECK: 90:14: UnexposedExpr= Extent=[90:14 - 90:19] // CHECK: 90:14: IntegerLiteral= Extent=[90:14 - 90:19] -// CHECK: 90:23: BinaryOperator= Extent=[90:23 - 90:33] +// CHECK: 90:23: BinaryOperator=<= Extent=[90:23 - 90:33] +// CHECK: 90:23: UnexposedExpr=c:2:14 Extent=[90:23 - 90:24] // CHECK: 90:23: DeclRefExpr=c:2:14 Extent=[90:23 - 90:24] // CHECK: 90:28: UnexposedExpr= Extent=[90:28 - 90:33] // CHECK: 90:28: IntegerLiteral= Extent=[90:28 - 90:33] -// CHECK: 91:9: BinaryOperator= Extent=[91:9 - 91:33] -// CHECK: 91:9: BinaryOperator= Extent=[91:9 - 91:19] +// CHECK: 91:8: ParenExpr= Extent=[91:8 - 91:34] +// CHECK: 91:9: BinaryOperator=&& Extent=[91:9 - 91:33] +// CHECK: 91:9: BinaryOperator=>= Extent=[91:9 - 91:19] +// CHECK: 91:9: UnexposedExpr=c:2:14 Extent=[91:9 - 91:10] // CHECK: 91:9: DeclRefExpr=c:2:14 Extent=[91:9 - 91:10] // CHECK: 91:14: UnexposedExpr= Extent=[91:14 - 91:19] // CHECK: 91:14: IntegerLiteral= Extent=[91:14 - 91:19] -// CHECK: 91:23: BinaryOperator= Extent=[91:23 - 91:33] +// CHECK: 91:23: BinaryOperator=<= Extent=[91:23 - 91:33] +// CHECK: 91:23: UnexposedExpr=c:2:14 Extent=[91:23 - 91:24] // CHECK: 91:23: DeclRefExpr=c:2:14 Extent=[91:23 - 91:24] // CHECK: 91:28: UnexposedExpr= Extent=[91:28 - 91:33] // CHECK: 91:28: IntegerLiteral= Extent=[91:28 - 91:33] -// CHECK: 92:9: BinaryOperator= Extent=[92:9 - 92:33] -// CHECK: 92:9: BinaryOperator= Extent=[92:9 - 92:19] +// CHECK: 92:8: ParenExpr= Extent=[92:8 - 92:34] +// CHECK: 92:9: BinaryOperator=&& Extent=[92:9 - 92:33] +// CHECK: 92:9: BinaryOperator=>= Extent=[92:9 - 92:19] +// CHECK: 92:9: UnexposedExpr=c:2:14 Extent=[92:9 - 92:10] // CHECK: 92:9: DeclRefExpr=c:2:14 Extent=[92:9 - 92:10] // CHECK: 92:14: UnexposedExpr= Extent=[92:14 - 92:19] // CHECK: 92:14: IntegerLiteral= Extent=[92:14 - 92:19] -// CHECK: 92:23: BinaryOperator= Extent=[92:23 - 92:33] +// CHECK: 92:23: BinaryOperator=<= Extent=[92:23 - 92:33] +// CHECK: 92:23: UnexposedExpr=c:2:14 Extent=[92:23 - 92:24] // CHECK: 92:23: DeclRefExpr=c:2:14 Extent=[92:23 - 92:24] // CHECK: 92:28: UnexposedExpr= Extent=[92:28 - 92:33] // CHECK: 92:28: IntegerLiteral= Extent=[92:28 - 92:33] -// CHECK: 93:9: BinaryOperator= Extent=[93:9 - 93:33] -// CHECK: 93:9: BinaryOperator= Extent=[93:9 - 93:19] +// CHECK: 93:8: ParenExpr= Extent=[93:8 - 93:34] +// CHECK: 93:9: BinaryOperator=&& Extent=[93:9 - 93:33] +// CHECK: 93:9: BinaryOperator=>= Extent=[93:9 - 93:19] +// CHECK: 93:9: UnexposedExpr=c:2:14 Extent=[93:9 - 93:10] // CHECK: 93:9: DeclRefExpr=c:2:14 Extent=[93:9 - 93:10] // CHECK: 93:14: UnexposedExpr= Extent=[93:14 - 93:19] // CHECK: 93:14: IntegerLiteral= Extent=[93:14 - 93:19] -// CHECK: 93:23: BinaryOperator= Extent=[93:23 - 93:33] +// CHECK: 93:23: BinaryOperator=<= Extent=[93:23 - 93:33] +// CHECK: 93:23: UnexposedExpr=c:2:14 Extent=[93:23 - 93:24] // CHECK: 93:23: DeclRefExpr=c:2:14 Extent=[93:23 - 93:24] // CHECK: 93:28: UnexposedExpr= Extent=[93:28 - 93:33] // CHECK: 93:28: IntegerLiteral= Extent=[93:28 - 93:33] -// CHECK: 94:9: BinaryOperator= Extent=[94:9 - 94:33] -// CHECK: 94:9: BinaryOperator= Extent=[94:9 - 94:19] +// CHECK: 94:8: ParenExpr= Extent=[94:8 - 94:34] +// CHECK: 94:9: BinaryOperator=&& Extent=[94:9 - 94:33] +// CHECK: 94:9: BinaryOperator=>= Extent=[94:9 - 94:19] +// CHECK: 94:9: UnexposedExpr=c:2:14 Extent=[94:9 - 94:10] // CHECK: 94:9: DeclRefExpr=c:2:14 Extent=[94:9 - 94:10] // CHECK: 94:14: UnexposedExpr= Extent=[94:14 - 94:19] // CHECK: 94:14: IntegerLiteral= Extent=[94:14 - 94:19] -// CHECK: 94:23: BinaryOperator= Extent=[94:23 - 94:33] +// CHECK: 94:23: BinaryOperator=<= Extent=[94:23 - 94:33] +// CHECK: 94:23: UnexposedExpr=c:2:14 Extent=[94:23 - 94:24] // CHECK: 94:23: DeclRefExpr=c:2:14 Extent=[94:23 - 94:24] // CHECK: 94:28: UnexposedExpr= Extent=[94:28 - 94:33] // CHECK: 94:28: IntegerLiteral= Extent=[94:28 - 94:33] -// CHECK: 95:9: BinaryOperator= Extent=[95:9 - 95:33] -// CHECK: 95:9: BinaryOperator= Extent=[95:9 - 95:19] +// CHECK: 95:8: ParenExpr= Extent=[95:8 - 95:34] +// CHECK: 95:9: BinaryOperator=&& Extent=[95:9 - 95:33] +// CHECK: 95:9: BinaryOperator=>= Extent=[95:9 - 95:19] +// CHECK: 95:9: UnexposedExpr=c:2:14 Extent=[95:9 - 95:10] // CHECK: 95:9: DeclRefExpr=c:2:14 Extent=[95:9 - 95:10] // CHECK: 95:14: UnexposedExpr= Extent=[95:14 - 95:19] // CHECK: 95:14: IntegerLiteral= Extent=[95:14 - 95:19] -// CHECK: 95:23: BinaryOperator= Extent=[95:23 - 95:33] +// CHECK: 95:23: BinaryOperator=<= Extent=[95:23 - 95:33] +// CHECK: 95:23: UnexposedExpr=c:2:14 Extent=[95:23 - 95:24] // CHECK: 95:23: DeclRefExpr=c:2:14 Extent=[95:23 - 95:24] // CHECK: 95:28: UnexposedExpr= Extent=[95:28 - 95:33] // CHECK: 95:28: IntegerLiteral= Extent=[95:28 - 95:33] -// CHECK: 96:9: BinaryOperator= Extent=[96:9 - 96:33] -// CHECK: 96:9: BinaryOperator= Extent=[96:9 - 96:19] +// CHECK: 96:8: ParenExpr= Extent=[96:8 - 96:34] +// CHECK: 96:9: BinaryOperator=&& Extent=[96:9 - 96:33] +// CHECK: 96:9: BinaryOperator=>= Extent=[96:9 - 96:19] +// CHECK: 96:9: UnexposedExpr=c:2:14 Extent=[96:9 - 96:10] // CHECK: 96:9: DeclRefExpr=c:2:14 Extent=[96:9 - 96:10] // CHECK: 96:14: UnexposedExpr= Extent=[96:14 - 96:19] // CHECK: 96:14: IntegerLiteral= Extent=[96:14 - 96:19] -// CHECK: 96:23: BinaryOperator= Extent=[96:23 - 96:33] +// CHECK: 96:23: BinaryOperator=<= Extent=[96:23 - 96:33] +// CHECK: 96:23: UnexposedExpr=c:2:14 Extent=[96:23 - 96:24] // CHECK: 96:23: DeclRefExpr=c:2:14 Extent=[96:23 - 96:24] // CHECK: 96:28: UnexposedExpr= Extent=[96:28 - 96:33] // CHECK: 96:28: IntegerLiteral= Extent=[96:28 - 96:33] -// CHECK: 97:9: BinaryOperator= Extent=[97:9 - 97:33] -// CHECK: 97:9: BinaryOperator= Extent=[97:9 - 97:19] +// CHECK: 97:8: ParenExpr= Extent=[97:8 - 97:34] +// CHECK: 97:9: BinaryOperator=&& Extent=[97:9 - 97:33] +// CHECK: 97:9: BinaryOperator=>= Extent=[97:9 - 97:19] +// CHECK: 97:9: UnexposedExpr=c:2:14 Extent=[97:9 - 97:10] // CHECK: 97:9: DeclRefExpr=c:2:14 Extent=[97:9 - 97:10] // CHECK: 97:14: UnexposedExpr= Extent=[97:14 - 97:19] // CHECK: 97:14: IntegerLiteral= Extent=[97:14 - 97:19] -// CHECK: 97:23: BinaryOperator= Extent=[97:23 - 97:33] +// CHECK: 97:23: BinaryOperator=<= Extent=[97:23 - 97:33] +// CHECK: 97:23: UnexposedExpr=c:2:14 Extent=[97:23 - 97:24] // CHECK: 97:23: DeclRefExpr=c:2:14 Extent=[97:23 - 97:24] // CHECK: 97:28: UnexposedExpr= Extent=[97:28 - 97:33] // CHECK: 97:28: IntegerLiteral= Extent=[97:28 - 97:33] -// CHECK: 98:8: BinaryOperator= Extent=[98:8 - 98:18] +// CHECK: 98:8: BinaryOperator=== Extent=[98:8 - 98:18] +// CHECK: 98:8: UnexposedExpr=c:2:14 Extent=[98:8 - 98:9] // CHECK: 98:8: DeclRefExpr=c:2:14 Extent=[98:8 - 98:9] // CHECK: 98:13: UnexposedExpr= Extent=[98:13 - 98:18] // CHECK: 98:13: IntegerLiteral= Extent=[98:13 - 98:18] -// CHECK: 98:23: BinaryOperator= Extent=[98:23 - 98:47] -// CHECK: 98:23: BinaryOperator= Extent=[98:23 - 98:33] +// CHECK: 98:22: ParenExpr= Extent=[98:22 - 98:48] +// CHECK: 98:23: BinaryOperator=&& Extent=[98:23 - 98:47] +// CHECK: 98:23: BinaryOperator=>= Extent=[98:23 - 98:33] +// CHECK: 98:23: UnexposedExpr=c:2:14 Extent=[98:23 - 98:24] // CHECK: 98:23: DeclRefExpr=c:2:14 Extent=[98:23 - 98:24] // CHECK: 98:28: UnexposedExpr= Extent=[98:28 - 98:33] // CHECK: 98:28: IntegerLiteral= Extent=[98:28 - 98:33] -// CHECK: 98:37: BinaryOperator= Extent=[98:37 - 98:47] +// CHECK: 98:37: BinaryOperator=<= Extent=[98:37 - 98:47] +// CHECK: 98:37: UnexposedExpr=c:2:14 Extent=[98:37 - 98:38] // CHECK: 98:37: DeclRefExpr=c:2:14 Extent=[98:37 - 98:38] // CHECK: 98:42: UnexposedExpr= Extent=[98:42 - 98:47] // CHECK: 98:42: IntegerLiteral= Extent=[98:42 - 98:47] -// CHECK: 99:9: BinaryOperator= Extent=[99:9 - 99:33] -// CHECK: 99:9: BinaryOperator= Extent=[99:9 - 99:19] +// CHECK: 99:8: ParenExpr= Extent=[99:8 - 99:34] +// CHECK: 99:9: BinaryOperator=&& Extent=[99:9 - 99:33] +// CHECK: 99:9: BinaryOperator=>= Extent=[99:9 - 99:19] +// CHECK: 99:9: UnexposedExpr=c:2:14 Extent=[99:9 - 99:10] // CHECK: 99:9: DeclRefExpr=c:2:14 Extent=[99:9 - 99:10] // CHECK: 99:14: UnexposedExpr= Extent=[99:14 - 99:19] // CHECK: 99:14: IntegerLiteral= Extent=[99:14 - 99:19] -// CHECK: 99:23: BinaryOperator= Extent=[99:23 - 99:33] +// CHECK: 99:23: BinaryOperator=<= Extent=[99:23 - 99:33] +// CHECK: 99:23: UnexposedExpr=c:2:14 Extent=[99:23 - 99:24] // CHECK: 99:23: DeclRefExpr=c:2:14 Extent=[99:23 - 99:24] // CHECK: 99:28: UnexposedExpr= Extent=[99:28 - 99:33] // CHECK: 99:28: IntegerLiteral= Extent=[99:28 - 99:33] -// CHECK: 100:9: BinaryOperator= Extent=[100:9 - 100:33] -// CHECK: 100:9: BinaryOperator= Extent=[100:9 - 100:19] +// CHECK: 100:8: ParenExpr= Extent=[100:8 - 100:34] +// CHECK: 100:9: BinaryOperator=&& Extent=[100:9 - 100:33] +// CHECK: 100:9: BinaryOperator=>= Extent=[100:9 - 100:19] +// CHECK: 100:9: UnexposedExpr=c:2:14 Extent=[100:9 - 100:10] // CHECK: 100:9: DeclRefExpr=c:2:14 Extent=[100:9 - 100:10] // CHECK: 100:14: UnexposedExpr= Extent=[100:14 - 100:19] // CHECK: 100:14: IntegerLiteral= Extent=[100:14 - 100:19] -// CHECK: 100:23: BinaryOperator= Extent=[100:23 - 100:33] +// CHECK: 100:23: BinaryOperator=<= Extent=[100:23 - 100:33] +// CHECK: 100:23: UnexposedExpr=c:2:14 Extent=[100:23 - 100:24] // CHECK: 100:23: DeclRefExpr=c:2:14 Extent=[100:23 - 100:24] // CHECK: 100:28: UnexposedExpr= Extent=[100:28 - 100:33] // CHECK: 100:28: IntegerLiteral= Extent=[100:28 - 100:33] -// CHECK: 101:9: BinaryOperator= Extent=[101:9 - 101:33] -// CHECK: 101:9: BinaryOperator= Extent=[101:9 - 101:19] +// CHECK: 101:8: ParenExpr= Extent=[101:8 - 101:34] +// CHECK: 101:9: BinaryOperator=&& Extent=[101:9 - 101:33] +// CHECK: 101:9: BinaryOperator=>= Extent=[101:9 - 101:19] +// CHECK: 101:9: UnexposedExpr=c:2:14 Extent=[101:9 - 101:10] // CHECK: 101:9: DeclRefExpr=c:2:14 Extent=[101:9 - 101:10] // CHECK: 101:14: UnexposedExpr= Extent=[101:14 - 101:19] // CHECK: 101:14: IntegerLiteral= Extent=[101:14 - 101:19] -// CHECK: 101:23: BinaryOperator= Extent=[101:23 - 101:33] +// CHECK: 101:23: BinaryOperator=<= Extent=[101:23 - 101:33] +// CHECK: 101:23: UnexposedExpr=c:2:14 Extent=[101:23 - 101:24] // CHECK: 101:23: DeclRefExpr=c:2:14 Extent=[101:23 - 101:24] // CHECK: 101:28: UnexposedExpr= Extent=[101:28 - 101:33] // CHECK: 101:28: IntegerLiteral= Extent=[101:28 - 101:33] -// CHECK: 102:9: BinaryOperator= Extent=[102:9 - 102:33] -// CHECK: 102:9: BinaryOperator= Extent=[102:9 - 102:19] +// CHECK: 102:8: ParenExpr= Extent=[102:8 - 102:34] +// CHECK: 102:9: BinaryOperator=&& Extent=[102:9 - 102:33] +// CHECK: 102:9: BinaryOperator=>= Extent=[102:9 - 102:19] +// CHECK: 102:9: UnexposedExpr=c:2:14 Extent=[102:9 - 102:10] // CHECK: 102:9: DeclRefExpr=c:2:14 Extent=[102:9 - 102:10] // CHECK: 102:14: UnexposedExpr= Extent=[102:14 - 102:19] // CHECK: 102:14: IntegerLiteral= Extent=[102:14 - 102:19] -// CHECK: 102:23: BinaryOperator= Extent=[102:23 - 102:33] +// CHECK: 102:23: BinaryOperator=<= Extent=[102:23 - 102:33] +// CHECK: 102:23: UnexposedExpr=c:2:14 Extent=[102:23 - 102:24] // CHECK: 102:23: DeclRefExpr=c:2:14 Extent=[102:23 - 102:24] // CHECK: 102:28: UnexposedExpr= Extent=[102:28 - 102:33] // CHECK: 102:28: IntegerLiteral= Extent=[102:28 - 102:33] -// CHECK: 103:9: BinaryOperator= Extent=[103:9 - 103:33] -// CHECK: 103:9: BinaryOperator= Extent=[103:9 - 103:19] +// CHECK: 103:8: ParenExpr= Extent=[103:8 - 103:34] +// CHECK: 103:9: BinaryOperator=&& Extent=[103:9 - 103:33] +// CHECK: 103:9: BinaryOperator=>= Extent=[103:9 - 103:19] +// CHECK: 103:9: UnexposedExpr=c:2:14 Extent=[103:9 - 103:10] // CHECK: 103:9: DeclRefExpr=c:2:14 Extent=[103:9 - 103:10] // CHECK: 103:14: UnexposedExpr= Extent=[103:14 - 103:19] // CHECK: 103:14: IntegerLiteral= Extent=[103:14 - 103:19] -// CHECK: 103:23: BinaryOperator= Extent=[103:23 - 103:33] +// CHECK: 103:23: BinaryOperator=<= Extent=[103:23 - 103:33] +// CHECK: 103:23: UnexposedExpr=c:2:14 Extent=[103:23 - 103:24] // CHECK: 103:23: DeclRefExpr=c:2:14 Extent=[103:23 - 103:24] // CHECK: 103:28: UnexposedExpr= Extent=[103:28 - 103:33] // CHECK: 103:28: IntegerLiteral= Extent=[103:28 - 103:33] -// CHECK: 104:9: BinaryOperator= Extent=[104:9 - 104:33] -// CHECK: 104:9: BinaryOperator= Extent=[104:9 - 104:19] +// CHECK: 104:8: ParenExpr= Extent=[104:8 - 104:34] +// CHECK: 104:9: BinaryOperator=&& Extent=[104:9 - 104:33] +// CHECK: 104:9: BinaryOperator=>= Extent=[104:9 - 104:19] +// CHECK: 104:9: UnexposedExpr=c:2:14 Extent=[104:9 - 104:10] // CHECK: 104:9: DeclRefExpr=c:2:14 Extent=[104:9 - 104:10] // CHECK: 104:14: UnexposedExpr= Extent=[104:14 - 104:19] // CHECK: 104:14: IntegerLiteral= Extent=[104:14 - 104:19] -// CHECK: 104:23: BinaryOperator= Extent=[104:23 - 104:33] +// CHECK: 104:23: BinaryOperator=<= Extent=[104:23 - 104:33] +// CHECK: 104:23: UnexposedExpr=c:2:14 Extent=[104:23 - 104:24] // CHECK: 104:23: DeclRefExpr=c:2:14 Extent=[104:23 - 104:24] // CHECK: 104:28: UnexposedExpr= Extent=[104:28 - 104:33] // CHECK: 104:28: IntegerLiteral= Extent=[104:28 - 104:33] -// CHECK: 105:8: BinaryOperator= Extent=[105:8 - 105:18] +// CHECK: 105:8: BinaryOperator=== Extent=[105:8 - 105:18] +// CHECK: 105:8: UnexposedExpr=c:2:14 Extent=[105:8 - 105:9] // CHECK: 105:8: DeclRefExpr=c:2:14 Extent=[105:8 - 105:9] // CHECK: 105:13: UnexposedExpr= Extent=[105:13 - 105:18] // CHECK: 105:13: IntegerLiteral= Extent=[105:13 - 105:18] -// CHECK: 105:23: BinaryOperator= Extent=[105:23 - 105:47] -// CHECK: 105:23: BinaryOperator= Extent=[105:23 - 105:33] +// CHECK: 105:22: ParenExpr= Extent=[105:22 - 105:48] +// CHECK: 105:23: BinaryOperator=&& Extent=[105:23 - 105:47] +// CHECK: 105:23: BinaryOperator=>= Extent=[105:23 - 105:33] +// CHECK: 105:23: UnexposedExpr=c:2:14 Extent=[105:23 - 105:24] // CHECK: 105:23: DeclRefExpr=c:2:14 Extent=[105:23 - 105:24] // CHECK: 105:28: UnexposedExpr= Extent=[105:28 - 105:33] // CHECK: 105:28: IntegerLiteral= Extent=[105:28 - 105:33] -// CHECK: 105:37: BinaryOperator= Extent=[105:37 - 105:47] +// CHECK: 105:37: BinaryOperator=<= Extent=[105:37 - 105:47] +// CHECK: 105:37: UnexposedExpr=c:2:14 Extent=[105:37 - 105:38] // CHECK: 105:37: DeclRefExpr=c:2:14 Extent=[105:37 - 105:38] // CHECK: 105:42: UnexposedExpr= Extent=[105:42 - 105:47] // CHECK: 105:42: IntegerLiteral= Extent=[105:42 - 105:47] -// CHECK: 106:9: BinaryOperator= Extent=[106:9 - 106:33] -// CHECK: 106:9: BinaryOperator= Extent=[106:9 - 106:19] +// CHECK: 106:8: ParenExpr= Extent=[106:8 - 106:34] +// CHECK: 106:9: BinaryOperator=&& Extent=[106:9 - 106:33] +// CHECK: 106:9: BinaryOperator=>= Extent=[106:9 - 106:19] +// CHECK: 106:9: UnexposedExpr=c:2:14 Extent=[106:9 - 106:10] // CHECK: 106:9: DeclRefExpr=c:2:14 Extent=[106:9 - 106:10] // CHECK: 106:14: UnexposedExpr= Extent=[106:14 - 106:19] // CHECK: 106:14: IntegerLiteral= Extent=[106:14 - 106:19] -// CHECK: 106:23: BinaryOperator= Extent=[106:23 - 106:33] +// CHECK: 106:23: BinaryOperator=<= Extent=[106:23 - 106:33] +// CHECK: 106:23: UnexposedExpr=c:2:14 Extent=[106:23 - 106:24] // CHECK: 106:23: DeclRefExpr=c:2:14 Extent=[106:23 - 106:24] // CHECK: 106:28: UnexposedExpr= Extent=[106:28 - 106:33] // CHECK: 106:28: IntegerLiteral= Extent=[106:28 - 106:33] -// CHECK: 107:9: BinaryOperator= Extent=[107:9 - 107:33] -// CHECK: 107:9: BinaryOperator= Extent=[107:9 - 107:19] +// CHECK: 107:8: ParenExpr= Extent=[107:8 - 107:34] +// CHECK: 107:9: BinaryOperator=&& Extent=[107:9 - 107:33] +// CHECK: 107:9: BinaryOperator=>= Extent=[107:9 - 107:19] +// CHECK: 107:9: UnexposedExpr=c:2:14 Extent=[107:9 - 107:10] // CHECK: 107:9: DeclRefExpr=c:2:14 Extent=[107:9 - 107:10] // CHECK: 107:14: UnexposedExpr= Extent=[107:14 - 107:19] // CHECK: 107:14: IntegerLiteral= Extent=[107:14 - 107:19] -// CHECK: 107:23: BinaryOperator= Extent=[107:23 - 107:33] +// CHECK: 107:23: BinaryOperator=<= Extent=[107:23 - 107:33] +// CHECK: 107:23: UnexposedExpr=c:2:14 Extent=[107:23 - 107:24] // CHECK: 107:23: DeclRefExpr=c:2:14 Extent=[107:23 - 107:24] // CHECK: 107:28: UnexposedExpr= Extent=[107:28 - 107:33] // CHECK: 107:28: IntegerLiteral= Extent=[107:28 - 107:33] -// CHECK: 108:8: BinaryOperator= Extent=[108:8 - 108:18] +// CHECK: 108:8: BinaryOperator=== Extent=[108:8 - 108:18] +// CHECK: 108:8: UnexposedExpr=c:2:14 Extent=[108:8 - 108:9] // CHECK: 108:8: DeclRefExpr=c:2:14 Extent=[108:8 - 108:9] // CHECK: 108:13: UnexposedExpr= Extent=[108:13 - 108:18] // CHECK: 108:13: IntegerLiteral= Extent=[108:13 - 108:18] -// CHECK: 108:23: BinaryOperator= Extent=[108:23 - 108:47] -// CHECK: 108:23: BinaryOperator= Extent=[108:23 - 108:33] +// CHECK: 108:22: ParenExpr= Extent=[108:22 - 108:48] +// CHECK: 108:23: BinaryOperator=&& Extent=[108:23 - 108:47] +// CHECK: 108:23: BinaryOperator=>= Extent=[108:23 - 108:33] +// CHECK: 108:23: UnexposedExpr=c:2:14 Extent=[108:23 - 108:24] // CHECK: 108:23: DeclRefExpr=c:2:14 Extent=[108:23 - 108:24] // CHECK: 108:28: UnexposedExpr= Extent=[108:28 - 108:33] // CHECK: 108:28: IntegerLiteral= Extent=[108:28 - 108:33] -// CHECK: 108:37: BinaryOperator= Extent=[108:37 - 108:47] +// CHECK: 108:37: BinaryOperator=<= Extent=[108:37 - 108:47] +// CHECK: 108:37: UnexposedExpr=c:2:14 Extent=[108:37 - 108:38] // CHECK: 108:37: DeclRefExpr=c:2:14 Extent=[108:37 - 108:38] // CHECK: 108:42: UnexposedExpr= Extent=[108:42 - 108:47] // CHECK: 108:42: IntegerLiteral= Extent=[108:42 - 108:47] -// CHECK: 109:8: BinaryOperator= Extent=[109:8 - 109:18] +// CHECK: 109:8: BinaryOperator=== Extent=[109:8 - 109:18] +// CHECK: 109:8: UnexposedExpr=c:2:14 Extent=[109:8 - 109:9] // CHECK: 109:8: DeclRefExpr=c:2:14 Extent=[109:8 - 109:9] // CHECK: 109:13: UnexposedExpr= Extent=[109:13 - 109:18] // CHECK: 109:13: IntegerLiteral= Extent=[109:13 - 109:18] -// CHECK: 109:22: BinaryOperator= Extent=[109:22 - 109:32] +// CHECK: 109:22: BinaryOperator=== Extent=[109:22 - 109:32] +// CHECK: 109:22: UnexposedExpr=c:2:14 Extent=[109:22 - 109:23] // CHECK: 109:22: DeclRefExpr=c:2:14 Extent=[109:22 - 109:23] // CHECK: 109:27: UnexposedExpr= Extent=[109:27 - 109:32] // CHECK: 109:27: IntegerLiteral= Extent=[109:27 - 109:32] -// CHECK: 109:37: BinaryOperator= Extent=[109:37 - 109:61] -// CHECK: 109:37: BinaryOperator= Extent=[109:37 - 109:47] +// CHECK: 109:36: ParenExpr= Extent=[109:36 - 109:62] +// CHECK: 109:37: BinaryOperator=&& Extent=[109:37 - 109:61] +// CHECK: 109:37: BinaryOperator=>= Extent=[109:37 - 109:47] +// CHECK: 109:37: UnexposedExpr=c:2:14 Extent=[109:37 - 109:38] // CHECK: 109:37: DeclRefExpr=c:2:14 Extent=[109:37 - 109:38] // CHECK: 109:42: UnexposedExpr= Extent=[109:42 - 109:47] // CHECK: 109:42: IntegerLiteral= Extent=[109:42 - 109:47] -// CHECK: 109:51: BinaryOperator= Extent=[109:51 - 109:61] +// CHECK: 109:51: BinaryOperator=<= Extent=[109:51 - 109:61] +// CHECK: 109:51: UnexposedExpr=c:2:14 Extent=[109:51 - 109:52] // CHECK: 109:51: DeclRefExpr=c:2:14 Extent=[109:51 - 109:52] // CHECK: 109:56: UnexposedExpr= Extent=[109:56 - 109:61] // CHECK: 109:56: IntegerLiteral= Extent=[109:56 - 109:61] -// CHECK: 110:9: BinaryOperator= Extent=[110:9 - 110:33] -// CHECK: 110:9: BinaryOperator= Extent=[110:9 - 110:19] +// CHECK: 110:8: ParenExpr= Extent=[110:8 - 110:34] +// CHECK: 110:9: BinaryOperator=&& Extent=[110:9 - 110:33] +// CHECK: 110:9: BinaryOperator=>= Extent=[110:9 - 110:19] +// CHECK: 110:9: UnexposedExpr=c:2:14 Extent=[110:9 - 110:10] // CHECK: 110:9: DeclRefExpr=c:2:14 Extent=[110:9 - 110:10] // CHECK: 110:14: UnexposedExpr= Extent=[110:14 - 110:19] // CHECK: 110:14: IntegerLiteral= Extent=[110:14 - 110:19] -// CHECK: 110:23: BinaryOperator= Extent=[110:23 - 110:33] +// CHECK: 110:23: BinaryOperator=<= Extent=[110:23 - 110:33] +// CHECK: 110:23: UnexposedExpr=c:2:14 Extent=[110:23 - 110:24] // CHECK: 110:23: DeclRefExpr=c:2:14 Extent=[110:23 - 110:24] // CHECK: 110:28: UnexposedExpr= Extent=[110:28 - 110:33] // CHECK: 110:28: IntegerLiteral= Extent=[110:28 - 110:33] -// CHECK: 111:9: BinaryOperator= Extent=[111:9 - 111:33] -// CHECK: 111:9: BinaryOperator= Extent=[111:9 - 111:19] +// CHECK: 111:8: ParenExpr= Extent=[111:8 - 111:34] +// CHECK: 111:9: BinaryOperator=&& Extent=[111:9 - 111:33] +// CHECK: 111:9: BinaryOperator=>= Extent=[111:9 - 111:19] +// CHECK: 111:9: UnexposedExpr=c:2:14 Extent=[111:9 - 111:10] // CHECK: 111:9: DeclRefExpr=c:2:14 Extent=[111:9 - 111:10] // CHECK: 111:14: UnexposedExpr= Extent=[111:14 - 111:19] // CHECK: 111:14: IntegerLiteral= Extent=[111:14 - 111:19] -// CHECK: 111:23: BinaryOperator= Extent=[111:23 - 111:33] +// CHECK: 111:23: BinaryOperator=<= Extent=[111:23 - 111:33] +// CHECK: 111:23: UnexposedExpr=c:2:14 Extent=[111:23 - 111:24] // CHECK: 111:23: DeclRefExpr=c:2:14 Extent=[111:23 - 111:24] // CHECK: 111:28: UnexposedExpr= Extent=[111:28 - 111:33] // CHECK: 111:28: IntegerLiteral= Extent=[111:28 - 111:33] -// CHECK: 112:8: BinaryOperator= Extent=[112:8 - 112:18] +// CHECK: 112:8: BinaryOperator=== Extent=[112:8 - 112:18] +// CHECK: 112:8: UnexposedExpr=c:2:14 Extent=[112:8 - 112:9] // CHECK: 112:8: DeclRefExpr=c:2:14 Extent=[112:8 - 112:9] // CHECK: 112:13: UnexposedExpr= Extent=[112:13 - 112:18] // CHECK: 112:13: IntegerLiteral= Extent=[112:13 - 112:18] -// CHECK: 112:22: BinaryOperator= Extent=[112:22 - 112:32] +// CHECK: 112:22: BinaryOperator=== Extent=[112:22 - 112:32] +// CHECK: 112:22: UnexposedExpr=c:2:14 Extent=[112:22 - 112:23] // CHECK: 112:22: DeclRefExpr=c:2:14 Extent=[112:22 - 112:23] // CHECK: 112:27: UnexposedExpr= Extent=[112:27 - 112:32] // CHECK: 112:27: IntegerLiteral= Extent=[112:27 - 112:32] -// CHECK: 112:37: BinaryOperator= Extent=[112:37 - 112:61] -// CHECK: 112:37: BinaryOperator= Extent=[112:37 - 112:47] +// CHECK: 112:36: ParenExpr= Extent=[112:36 - 112:62] +// CHECK: 112:37: BinaryOperator=&& Extent=[112:37 - 112:61] +// CHECK: 112:37: BinaryOperator=>= Extent=[112:37 - 112:47] +// CHECK: 112:37: UnexposedExpr=c:2:14 Extent=[112:37 - 112:38] // CHECK: 112:37: DeclRefExpr=c:2:14 Extent=[112:37 - 112:38] // CHECK: 112:42: UnexposedExpr= Extent=[112:42 - 112:47] // CHECK: 112:42: IntegerLiteral= Extent=[112:42 - 112:47] -// CHECK: 112:51: BinaryOperator= Extent=[112:51 - 112:61] +// CHECK: 112:51: BinaryOperator=<= Extent=[112:51 - 112:61] +// CHECK: 112:51: UnexposedExpr=c:2:14 Extent=[112:51 - 112:52] // CHECK: 112:51: DeclRefExpr=c:2:14 Extent=[112:51 - 112:52] // CHECK: 112:56: UnexposedExpr= Extent=[112:56 - 112:61] // CHECK: 112:56: IntegerLiteral= Extent=[112:56 - 112:61] -// CHECK: 113:9: BinaryOperator= Extent=[113:9 - 113:33] -// CHECK: 113:9: BinaryOperator= Extent=[113:9 - 113:19] +// CHECK: 113:8: ParenExpr= Extent=[113:8 - 113:34] +// CHECK: 113:9: BinaryOperator=&& Extent=[113:9 - 113:33] +// CHECK: 113:9: BinaryOperator=>= Extent=[113:9 - 113:19] +// CHECK: 113:9: UnexposedExpr=c:2:14 Extent=[113:9 - 113:10] // CHECK: 113:9: DeclRefExpr=c:2:14 Extent=[113:9 - 113:10] // CHECK: 113:14: UnexposedExpr= Extent=[113:14 - 113:19] // CHECK: 113:14: IntegerLiteral= Extent=[113:14 - 113:19] -// CHECK: 113:23: BinaryOperator= Extent=[113:23 - 113:33] +// CHECK: 113:23: BinaryOperator=<= Extent=[113:23 - 113:33] +// CHECK: 113:23: UnexposedExpr=c:2:14 Extent=[113:23 - 113:24] // CHECK: 113:23: DeclRefExpr=c:2:14 Extent=[113:23 - 113:24] // CHECK: 113:28: UnexposedExpr= Extent=[113:28 - 113:33] // CHECK: 113:28: IntegerLiteral= Extent=[113:28 - 113:33] -// CHECK: 114:8: BinaryOperator= Extent=[114:8 - 114:18] +// CHECK: 114:8: BinaryOperator=== Extent=[114:8 - 114:18] +// CHECK: 114:8: UnexposedExpr=c:2:14 Extent=[114:8 - 114:9] // CHECK: 114:8: DeclRefExpr=c:2:14 Extent=[114:8 - 114:9] // CHECK: 114:13: UnexposedExpr= Extent=[114:13 - 114:18] // CHECK: 114:13: IntegerLiteral= Extent=[114:13 - 114:18] -// CHECK: 114:23: BinaryOperator= Extent=[114:23 - 114:47] -// CHECK: 114:23: BinaryOperator= Extent=[114:23 - 114:33] +// CHECK: 114:22: ParenExpr= Extent=[114:22 - 114:48] +// CHECK: 114:23: BinaryOperator=&& Extent=[114:23 - 114:47] +// CHECK: 114:23: BinaryOperator=>= Extent=[114:23 - 114:33] +// CHECK: 114:23: UnexposedExpr=c:2:14 Extent=[114:23 - 114:24] // CHECK: 114:23: DeclRefExpr=c:2:14 Extent=[114:23 - 114:24] // CHECK: 114:28: UnexposedExpr= Extent=[114:28 - 114:33] // CHECK: 114:28: IntegerLiteral= Extent=[114:28 - 114:33] -// CHECK: 114:37: BinaryOperator= Extent=[114:37 - 114:47] +// CHECK: 114:37: BinaryOperator=<= Extent=[114:37 - 114:47] +// CHECK: 114:37: UnexposedExpr=c:2:14 Extent=[114:37 - 114:38] // CHECK: 114:37: DeclRefExpr=c:2:14 Extent=[114:37 - 114:38] // CHECK: 114:42: UnexposedExpr= Extent=[114:42 - 114:47] // CHECK: 114:42: IntegerLiteral= Extent=[114:42 - 114:47] -// CHECK: 115:8: BinaryOperator= Extent=[115:8 - 115:18] +// CHECK: 115:8: BinaryOperator=== Extent=[115:8 - 115:18] +// CHECK: 115:8: UnexposedExpr=c:2:14 Extent=[115:8 - 115:9] // CHECK: 115:8: DeclRefExpr=c:2:14 Extent=[115:8 - 115:9] // CHECK: 115:13: UnexposedExpr= Extent=[115:13 - 115:18] // CHECK: 115:13: IntegerLiteral= Extent=[115:13 - 115:18] -// CHECK: 115:23: BinaryOperator= Extent=[115:23 - 115:47] -// CHECK: 115:23: BinaryOperator= Extent=[115:23 - 115:33] +// CHECK: 115:22: ParenExpr= Extent=[115:22 - 115:48] +// CHECK: 115:23: BinaryOperator=&& Extent=[115:23 - 115:47] +// CHECK: 115:23: BinaryOperator=>= Extent=[115:23 - 115:33] +// CHECK: 115:23: UnexposedExpr=c:2:14 Extent=[115:23 - 115:24] // CHECK: 115:23: DeclRefExpr=c:2:14 Extent=[115:23 - 115:24] // CHECK: 115:28: UnexposedExpr= Extent=[115:28 - 115:33] // CHECK: 115:28: IntegerLiteral= Extent=[115:28 - 115:33] -// CHECK: 115:37: BinaryOperator= Extent=[115:37 - 115:47] +// CHECK: 115:37: BinaryOperator=<= Extent=[115:37 - 115:47] +// CHECK: 115:37: UnexposedExpr=c:2:14 Extent=[115:37 - 115:38] // CHECK: 115:37: DeclRefExpr=c:2:14 Extent=[115:37 - 115:38] // CHECK: 115:42: UnexposedExpr= Extent=[115:42 - 115:47] // CHECK: 115:42: IntegerLiteral= Extent=[115:42 - 115:47] -// CHECK: 116:9: BinaryOperator= Extent=[116:9 - 116:33] -// CHECK: 116:9: BinaryOperator= Extent=[116:9 - 116:19] +// CHECK: 116:8: ParenExpr= Extent=[116:8 - 116:34] +// CHECK: 116:9: BinaryOperator=&& Extent=[116:9 - 116:33] +// CHECK: 116:9: BinaryOperator=>= Extent=[116:9 - 116:19] +// CHECK: 116:9: UnexposedExpr=c:2:14 Extent=[116:9 - 116:10] // CHECK: 116:9: DeclRefExpr=c:2:14 Extent=[116:9 - 116:10] // CHECK: 116:14: UnexposedExpr= Extent=[116:14 - 116:19] // CHECK: 116:14: IntegerLiteral= Extent=[116:14 - 116:19] -// CHECK: 116:23: BinaryOperator= Extent=[116:23 - 116:33] +// CHECK: 116:23: BinaryOperator=<= Extent=[116:23 - 116:33] +// CHECK: 116:23: UnexposedExpr=c:2:14 Extent=[116:23 - 116:24] // CHECK: 116:23: DeclRefExpr=c:2:14 Extent=[116:23 - 116:24] // CHECK: 116:28: UnexposedExpr= Extent=[116:28 - 116:33] // CHECK: 116:28: IntegerLiteral= Extent=[116:28 - 116:33] -// CHECK: 117:9: BinaryOperator= Extent=[117:9 - 117:33] -// CHECK: 117:9: BinaryOperator= Extent=[117:9 - 117:19] +// CHECK: 117:8: ParenExpr= Extent=[117:8 - 117:34] +// CHECK: 117:9: BinaryOperator=&& Extent=[117:9 - 117:33] +// CHECK: 117:9: BinaryOperator=>= Extent=[117:9 - 117:19] +// CHECK: 117:9: UnexposedExpr=c:2:14 Extent=[117:9 - 117:10] // CHECK: 117:9: DeclRefExpr=c:2:14 Extent=[117:9 - 117:10] // CHECK: 117:14: UnexposedExpr= Extent=[117:14 - 117:19] // CHECK: 117:14: IntegerLiteral= Extent=[117:14 - 117:19] -// CHECK: 117:23: BinaryOperator= Extent=[117:23 - 117:33] +// CHECK: 117:23: BinaryOperator=<= Extent=[117:23 - 117:33] +// CHECK: 117:23: UnexposedExpr=c:2:14 Extent=[117:23 - 117:24] // CHECK: 117:23: DeclRefExpr=c:2:14 Extent=[117:23 - 117:24] // CHECK: 117:28: UnexposedExpr= Extent=[117:28 - 117:33] // CHECK: 117:28: IntegerLiteral= Extent=[117:28 - 117:33] -// CHECK: 118:9: BinaryOperator= Extent=[118:9 - 118:35] -// CHECK: 118:9: BinaryOperator= Extent=[118:9 - 118:20] +// CHECK: 118:8: ParenExpr= Extent=[118:8 - 118:36] +// CHECK: 118:9: BinaryOperator=&& Extent=[118:9 - 118:35] +// CHECK: 118:9: BinaryOperator=>= Extent=[118:9 - 118:20] +// CHECK: 118:9: UnexposedExpr=c:2:14 Extent=[118:9 - 118:10] // CHECK: 118:9: DeclRefExpr=c:2:14 Extent=[118:9 - 118:10] // CHECK: 118:14: UnexposedExpr= Extent=[118:14 - 118:20] // CHECK: 118:14: IntegerLiteral= Extent=[118:14 - 118:20] -// CHECK: 118:24: BinaryOperator= Extent=[118:24 - 118:35] +// CHECK: 118:24: BinaryOperator=<= Extent=[118:24 - 118:35] +// CHECK: 118:24: UnexposedExpr=c:2:14 Extent=[118:24 - 118:25] // CHECK: 118:24: DeclRefExpr=c:2:14 Extent=[118:24 - 118:25] // CHECK: 118:29: UnexposedExpr= Extent=[118:29 - 118:35] // CHECK: 118:29: IntegerLiteral= Extent=[118:29 - 118:35] -// CHECK: 119:9: BinaryOperator= Extent=[119:9 - 119:35] -// CHECK: 119:9: BinaryOperator= Extent=[119:9 - 119:20] +// CHECK: 119:8: ParenExpr= Extent=[119:8 - 119:36] +// CHECK: 119:9: BinaryOperator=&& Extent=[119:9 - 119:35] +// CHECK: 119:9: BinaryOperator=>= Extent=[119:9 - 119:20] +// CHECK: 119:9: UnexposedExpr=c:2:14 Extent=[119:9 - 119:10] // CHECK: 119:9: DeclRefExpr=c:2:14 Extent=[119:9 - 119:10] // CHECK: 119:14: UnexposedExpr= Extent=[119:14 - 119:20] // CHECK: 119:14: IntegerLiteral= Extent=[119:14 - 119:20] -// CHECK: 119:24: BinaryOperator= Extent=[119:24 - 119:35] +// CHECK: 119:24: BinaryOperator=<= Extent=[119:24 - 119:35] +// CHECK: 119:24: UnexposedExpr=c:2:14 Extent=[119:24 - 119:25] // CHECK: 119:24: DeclRefExpr=c:2:14 Extent=[119:24 - 119:25] // CHECK: 119:29: UnexposedExpr= Extent=[119:29 - 119:35] // CHECK: 119:29: IntegerLiteral= Extent=[119:29 - 119:35] -// CHECK: 120:8: BinaryOperator= Extent=[120:8 - 120:19] +// CHECK: 120:8: BinaryOperator=== Extent=[120:8 - 120:19] +// CHECK: 120:8: UnexposedExpr=c:2:14 Extent=[120:8 - 120:9] // CHECK: 120:8: DeclRefExpr=c:2:14 Extent=[120:8 - 120:9] // CHECK: 120:13: UnexposedExpr= Extent=[120:13 - 120:19] // CHECK: 120:13: IntegerLiteral= Extent=[120:13 - 120:19] -// CHECK: 120:24: BinaryOperator= Extent=[120:24 - 120:50] -// CHECK: 120:24: BinaryOperator= Extent=[120:24 - 120:35] +// CHECK: 120:23: ParenExpr= Extent=[120:23 - 120:51] +// CHECK: 120:24: BinaryOperator=&& Extent=[120:24 - 120:50] +// CHECK: 120:24: BinaryOperator=>= Extent=[120:24 - 120:35] +// CHECK: 120:24: UnexposedExpr=c:2:14 Extent=[120:24 - 120:25] // CHECK: 120:24: DeclRefExpr=c:2:14 Extent=[120:24 - 120:25] // CHECK: 120:29: UnexposedExpr= Extent=[120:29 - 120:35] // CHECK: 120:29: IntegerLiteral= Extent=[120:29 - 120:35] -// CHECK: 120:39: BinaryOperator= Extent=[120:39 - 120:50] +// CHECK: 120:39: BinaryOperator=<= Extent=[120:39 - 120:50] +// CHECK: 120:39: UnexposedExpr=c:2:14 Extent=[120:39 - 120:40] // CHECK: 120:39: DeclRefExpr=c:2:14 Extent=[120:39 - 120:40] // CHECK: 120:44: UnexposedExpr= Extent=[120:44 - 120:50] // CHECK: 120:44: IntegerLiteral= Extent=[120:44 - 120:50] -// CHECK: 121:9: BinaryOperator= Extent=[121:9 - 121:35] -// CHECK: 121:9: BinaryOperator= Extent=[121:9 - 121:20] +// CHECK: 121:8: ParenExpr= Extent=[121:8 - 121:36] +// CHECK: 121:9: BinaryOperator=&& Extent=[121:9 - 121:35] +// CHECK: 121:9: BinaryOperator=>= Extent=[121:9 - 121:20] +// CHECK: 121:9: UnexposedExpr=c:2:14 Extent=[121:9 - 121:10] // CHECK: 121:9: DeclRefExpr=c:2:14 Extent=[121:9 - 121:10] // CHECK: 121:14: UnexposedExpr= Extent=[121:14 - 121:20] // CHECK: 121:14: IntegerLiteral= Extent=[121:14 - 121:20] -// CHECK: 121:24: BinaryOperator= Extent=[121:24 - 121:35] +// CHECK: 121:24: BinaryOperator=<= Extent=[121:24 - 121:35] +// CHECK: 121:24: UnexposedExpr=c:2:14 Extent=[121:24 - 121:25] // CHECK: 121:24: DeclRefExpr=c:2:14 Extent=[121:24 - 121:25] // CHECK: 121:29: UnexposedExpr= Extent=[121:29 - 121:35] // CHECK: 121:29: IntegerLiteral= Extent=[121:29 - 121:35] -// CHECK: 122:8: BinaryOperator= Extent=[122:8 - 122:19] +// CHECK: 122:8: BinaryOperator=== Extent=[122:8 - 122:19] +// CHECK: 122:8: UnexposedExpr=c:2:14 Extent=[122:8 - 122:9] // CHECK: 122:8: DeclRefExpr=c:2:14 Extent=[122:8 - 122:9] // CHECK: 122:13: UnexposedExpr= Extent=[122:13 - 122:19] // CHECK: 122:13: IntegerLiteral= Extent=[122:13 - 122:19] -// CHECK: 122:24: BinaryOperator= Extent=[122:24 - 122:50] -// CHECK: 122:24: BinaryOperator= Extent=[122:24 - 122:35] +// CHECK: 122:23: ParenExpr= Extent=[122:23 - 122:51] +// CHECK: 122:24: BinaryOperator=&& Extent=[122:24 - 122:50] +// CHECK: 122:24: BinaryOperator=>= Extent=[122:24 - 122:35] +// CHECK: 122:24: UnexposedExpr=c:2:14 Extent=[122:24 - 122:25] // CHECK: 122:24: DeclRefExpr=c:2:14 Extent=[122:24 - 122:25] // CHECK: 122:29: UnexposedExpr= Extent=[122:29 - 122:35] // CHECK: 122:29: IntegerLiteral= Extent=[122:29 - 122:35] -// CHECK: 122:39: BinaryOperator= Extent=[122:39 - 122:50] +// CHECK: 122:39: BinaryOperator=<= Extent=[122:39 - 122:50] +// CHECK: 122:39: UnexposedExpr=c:2:14 Extent=[122:39 - 122:40] // CHECK: 122:39: DeclRefExpr=c:2:14 Extent=[122:39 - 122:40] // CHECK: 122:44: UnexposedExpr= Extent=[122:44 - 122:50] // CHECK: 122:44: IntegerLiteral= Extent=[122:44 - 122:50] -// CHECK: 123:9: BinaryOperator= Extent=[123:9 - 123:35] -// CHECK: 123:9: BinaryOperator= Extent=[123:9 - 123:20] +// CHECK: 123:8: ParenExpr= Extent=[123:8 - 123:36] +// CHECK: 123:9: BinaryOperator=&& Extent=[123:9 - 123:35] +// CHECK: 123:9: BinaryOperator=>= Extent=[123:9 - 123:20] +// CHECK: 123:9: UnexposedExpr=c:2:14 Extent=[123:9 - 123:10] // CHECK: 123:9: DeclRefExpr=c:2:14 Extent=[123:9 - 123:10] // CHECK: 123:14: UnexposedExpr= Extent=[123:14 - 123:20] // CHECK: 123:14: IntegerLiteral= Extent=[123:14 - 123:20] -// CHECK: 123:24: BinaryOperator= Extent=[123:24 - 123:35] +// CHECK: 123:24: BinaryOperator=<= Extent=[123:24 - 123:35] +// CHECK: 123:24: UnexposedExpr=c:2:14 Extent=[123:24 - 123:25] // CHECK: 123:24: DeclRefExpr=c:2:14 Extent=[123:24 - 123:25] // CHECK: 123:29: UnexposedExpr= Extent=[123:29 - 123:35] // CHECK: 123:29: IntegerLiteral= Extent=[123:29 - 123:35] -// CHECK: 124:8: BinaryOperator= Extent=[124:8 - 124:19] +// CHECK: 124:8: BinaryOperator=== Extent=[124:8 - 124:19] +// CHECK: 124:8: UnexposedExpr=c:2:14 Extent=[124:8 - 124:9] // CHECK: 124:8: DeclRefExpr=c:2:14 Extent=[124:8 - 124:9] // CHECK: 124:13: UnexposedExpr= Extent=[124:13 - 124:19] // CHECK: 124:13: IntegerLiteral= Extent=[124:13 - 124:19] -// CHECK: 124:23: BinaryOperator= Extent=[124:23 - 124:34] +// CHECK: 124:23: BinaryOperator=== Extent=[124:23 - 124:34] +// CHECK: 124:23: UnexposedExpr=c:2:14 Extent=[124:23 - 124:24] // CHECK: 124:23: DeclRefExpr=c:2:14 Extent=[124:23 - 124:24] // CHECK: 124:28: UnexposedExpr= Extent=[124:28 - 124:34] // CHECK: 124:28: IntegerLiteral= Extent=[124:28 - 124:34] -// CHECK: 124:38: BinaryOperator= Extent=[124:38 - 124:49] +// CHECK: 124:38: BinaryOperator=== Extent=[124:38 - 124:49] +// CHECK: 124:38: UnexposedExpr=c:2:14 Extent=[124:38 - 124:39] // CHECK: 124:38: DeclRefExpr=c:2:14 Extent=[124:38 - 124:39] // CHECK: 124:43: UnexposedExpr= Extent=[124:43 - 124:49] // CHECK: 124:43: IntegerLiteral= Extent=[124:43 - 124:49] -// CHECK: 124:53: BinaryOperator= Extent=[124:53 - 124:64] +// CHECK: 124:53: BinaryOperator=== Extent=[124:53 - 124:64] +// CHECK: 124:53: UnexposedExpr=c:2:14 Extent=[124:53 - 124:54] // CHECK: 124:53: DeclRefExpr=c:2:14 Extent=[124:53 - 124:54] // CHECK: 124:58: UnexposedExpr= Extent=[124:58 - 124:64] // CHECK: 124:58: IntegerLiteral= Extent=[124:58 - 124:64] -// CHECK: 125:5: BinaryOperator= Extent=[125:5 - 125:16] +// CHECK: 125:5: BinaryOperator=== Extent=[125:5 - 125:16] +// CHECK: 125:5: UnexposedExpr=c:2:14 Extent=[125:5 - 125:6] // CHECK: 125:5: DeclRefExpr=c:2:14 Extent=[125:5 - 125:6] // CHECK: 125:10: UnexposedExpr= Extent=[125:10 - 125:16] // CHECK: 125:10: IntegerLiteral= Extent=[125:10 - 125:16] -// CHECK: 125:20: BinaryOperator= Extent=[125:20 - 125:31] +// CHECK: 125:20: BinaryOperator=== Extent=[125:20 - 125:31] +// CHECK: 125:20: UnexposedExpr=c:2:14 Extent=[125:20 - 125:21] // CHECK: 125:20: DeclRefExpr=c:2:14 Extent=[125:20 - 125:21] // CHECK: 125:25: UnexposedExpr= Extent=[125:25 - 125:31] // CHECK: 125:25: IntegerLiteral= Extent=[125:25 - 125:31] -// CHECK: 125:36: BinaryOperator= Extent=[125:36 - 125:62] -// CHECK: 125:36: BinaryOperator= Extent=[125:36 - 125:47] +// CHECK: 125:35: ParenExpr= Extent=[125:35 - 125:63] +// CHECK: 125:36: BinaryOperator=&& Extent=[125:36 - 125:62] +// CHECK: 125:36: BinaryOperator=>= Extent=[125:36 - 125:47] +// CHECK: 125:36: UnexposedExpr=c:2:14 Extent=[125:36 - 125:37] // CHECK: 125:36: DeclRefExpr=c:2:14 Extent=[125:36 - 125:37] // CHECK: 125:41: UnexposedExpr= Extent=[125:41 - 125:47] // CHECK: 125:41: IntegerLiteral= Extent=[125:41 - 125:47] -// CHECK: 125:51: BinaryOperator= Extent=[125:51 - 125:62] +// CHECK: 125:51: BinaryOperator=<= Extent=[125:51 - 125:62] +// CHECK: 125:51: UnexposedExpr=c:2:14 Extent=[125:51 - 125:52] // CHECK: 125:51: DeclRefExpr=c:2:14 Extent=[125:51 - 125:52] // CHECK: 125:56: UnexposedExpr= Extent=[125:56 - 125:62] // CHECK: 125:56: IntegerLiteral= Extent=[125:56 - 125:62] -// CHECK: 126:8: BinaryOperator= Extent=[126:8 - 126:19] +// CHECK: 126:8: BinaryOperator=== Extent=[126:8 - 126:19] +// CHECK: 126:8: UnexposedExpr=c:2:14 Extent=[126:8 - 126:9] // CHECK: 126:8: DeclRefExpr=c:2:14 Extent=[126:8 - 126:9] // CHECK: 126:13: UnexposedExpr= Extent=[126:13 - 126:19] // CHECK: 126:13: IntegerLiteral= Extent=[126:13 - 126:19] -// CHECK: 126:24: BinaryOperator= Extent=[126:24 - 126:50] -// CHECK: 126:24: BinaryOperator= Extent=[126:24 - 126:35] +// CHECK: 126:23: ParenExpr= Extent=[126:23 - 126:51] +// CHECK: 126:24: BinaryOperator=&& Extent=[126:24 - 126:50] +// CHECK: 126:24: BinaryOperator=>= Extent=[126:24 - 126:35] +// CHECK: 126:24: UnexposedExpr=c:2:14 Extent=[126:24 - 126:25] // CHECK: 126:24: DeclRefExpr=c:2:14 Extent=[126:24 - 126:25] // CHECK: 126:29: UnexposedExpr= Extent=[126:29 - 126:35] // CHECK: 126:29: IntegerLiteral= Extent=[126:29 - 126:35] -// CHECK: 126:39: BinaryOperator= Extent=[126:39 - 126:50] +// CHECK: 126:39: BinaryOperator=<= Extent=[126:39 - 126:50] +// CHECK: 126:39: UnexposedExpr=c:2:14 Extent=[126:39 - 126:40] // CHECK: 126:39: DeclRefExpr=c:2:14 Extent=[126:39 - 126:40] // CHECK: 126:44: UnexposedExpr= Extent=[126:44 - 126:50] // CHECK: 126:44: IntegerLiteral= Extent=[126:44 - 126:50] -// CHECK: 127:8: BinaryOperator= Extent=[127:8 - 127:19] +// CHECK: 127:8: BinaryOperator=== Extent=[127:8 - 127:19] +// CHECK: 127:8: UnexposedExpr=c:2:14 Extent=[127:8 - 127:9] // CHECK: 127:8: DeclRefExpr=c:2:14 Extent=[127:8 - 127:9] // CHECK: 127:13: UnexposedExpr= Extent=[127:13 - 127:19] // CHECK: 127:13: IntegerLiteral= Extent=[127:13 - 127:19] -// CHECK: 127:23: BinaryOperator= Extent=[127:23 - 127:34] +// CHECK: 127:23: BinaryOperator=== Extent=[127:23 - 127:34] +// CHECK: 127:23: UnexposedExpr=c:2:14 Extent=[127:23 - 127:24] // CHECK: 127:23: DeclRefExpr=c:2:14 Extent=[127:23 - 127:24] // CHECK: 127:28: UnexposedExpr= Extent=[127:28 - 127:34] // CHECK: 127:28: IntegerLiteral= Extent=[127:28 - 127:34] -// CHECK: 127:38: BinaryOperator= Extent=[127:38 - 127:49] +// CHECK: 127:38: BinaryOperator=== Extent=[127:38 - 127:49] +// CHECK: 127:38: UnexposedExpr=c:2:14 Extent=[127:38 - 127:39] // CHECK: 127:38: DeclRefExpr=c:2:14 Extent=[127:38 - 127:39] // CHECK: 127:43: UnexposedExpr= Extent=[127:43 - 127:49] // CHECK: 127:43: IntegerLiteral= Extent=[127:43 - 127:49] -// CHECK: 127:53: BinaryOperator= Extent=[127:53 - 127:64] +// CHECK: 127:53: BinaryOperator=== Extent=[127:53 - 127:64] +// CHECK: 127:53: UnexposedExpr=c:2:14 Extent=[127:53 - 127:54] // CHECK: 127:53: DeclRefExpr=c:2:14 Extent=[127:53 - 127:54] // CHECK: 127:58: UnexposedExpr= Extent=[127:58 - 127:64] // CHECK: 127:58: IntegerLiteral= Extent=[127:58 - 127:64] -// CHECK: 128:6: BinaryOperator= Extent=[128:6 - 128:32] -// CHECK: 128:6: BinaryOperator= Extent=[128:6 - 128:17] +// CHECK: 128:5: ParenExpr= Extent=[128:5 - 128:33] +// CHECK: 128:6: BinaryOperator=&& Extent=[128:6 - 128:32] +// CHECK: 128:6: BinaryOperator=>= Extent=[128:6 - 128:17] +// CHECK: 128:6: UnexposedExpr=c:2:14 Extent=[128:6 - 128:7] // CHECK: 128:6: DeclRefExpr=c:2:14 Extent=[128:6 - 128:7] // CHECK: 128:11: UnexposedExpr= Extent=[128:11 - 128:17] // CHECK: 128:11: IntegerLiteral= Extent=[128:11 - 128:17] -// CHECK: 128:21: BinaryOperator= Extent=[128:21 - 128:32] +// CHECK: 128:21: BinaryOperator=<= Extent=[128:21 - 128:32] +// CHECK: 128:21: UnexposedExpr=c:2:14 Extent=[128:21 - 128:22] // CHECK: 128:21: DeclRefExpr=c:2:14 Extent=[128:21 - 128:22] // CHECK: 128:26: UnexposedExpr= Extent=[128:26 - 128:32] // CHECK: 128:26: IntegerLiteral= Extent=[128:26 - 128:32] -// CHECK: 129:9: BinaryOperator= Extent=[129:9 - 129:35] -// CHECK: 129:9: BinaryOperator= Extent=[129:9 - 129:20] +// CHECK: 129:8: ParenExpr= Extent=[129:8 - 129:36] +// CHECK: 129:9: BinaryOperator=&& Extent=[129:9 - 129:35] +// CHECK: 129:9: BinaryOperator=>= Extent=[129:9 - 129:20] +// CHECK: 129:9: UnexposedExpr=c:2:14 Extent=[129:9 - 129:10] // CHECK: 129:9: DeclRefExpr=c:2:14 Extent=[129:9 - 129:10] // CHECK: 129:14: UnexposedExpr= Extent=[129:14 - 129:20] // CHECK: 129:14: IntegerLiteral= Extent=[129:14 - 129:20] -// CHECK: 129:24: BinaryOperator= Extent=[129:24 - 129:35] +// CHECK: 129:24: BinaryOperator=<= Extent=[129:24 - 129:35] +// CHECK: 129:24: UnexposedExpr=c:2:14 Extent=[129:24 - 129:25] // CHECK: 129:24: DeclRefExpr=c:2:14 Extent=[129:24 - 129:25] // CHECK: 129:29: UnexposedExpr= Extent=[129:29 - 129:35] // CHECK: 129:29: IntegerLiteral= Extent=[129:29 - 129:35] -// CHECK: 130:8: BinaryOperator= Extent=[130:8 - 130:19] +// CHECK: 130:8: BinaryOperator=== Extent=[130:8 - 130:19] +// CHECK: 130:8: UnexposedExpr=c:2:14 Extent=[130:8 - 130:9] // CHECK: 130:8: DeclRefExpr=c:2:14 Extent=[130:8 - 130:9] // CHECK: 130:13: UnexposedExpr= Extent=[130:13 - 130:19] // CHECK: 130:13: IntegerLiteral= Extent=[130:13 - 130:19] -// CHECK: 130:23: BinaryOperator= Extent=[130:23 - 130:34] +// CHECK: 130:23: BinaryOperator=== Extent=[130:23 - 130:34] +// CHECK: 130:23: UnexposedExpr=c:2:14 Extent=[130:23 - 130:24] // CHECK: 130:23: DeclRefExpr=c:2:14 Extent=[130:23 - 130:24] // CHECK: 130:28: UnexposedExpr= Extent=[130:28 - 130:34] // CHECK: 130:28: IntegerLiteral= Extent=[130:28 - 130:34] -// CHECK: 130:38: BinaryOperator= Extent=[130:38 - 130:49] +// CHECK: 130:38: BinaryOperator=== Extent=[130:38 - 130:49] +// CHECK: 130:38: UnexposedExpr=c:2:14 Extent=[130:38 - 130:39] // CHECK: 130:38: DeclRefExpr=c:2:14 Extent=[130:38 - 130:39] // CHECK: 130:43: UnexposedExpr= Extent=[130:43 - 130:49] // CHECK: 130:43: IntegerLiteral= Extent=[130:43 - 130:49] -// CHECK: 130:53: BinaryOperator= Extent=[130:53 - 130:64] +// CHECK: 130:53: BinaryOperator=== Extent=[130:53 - 130:64] +// CHECK: 130:53: UnexposedExpr=c:2:14 Extent=[130:53 - 130:54] // CHECK: 130:53: DeclRefExpr=c:2:14 Extent=[130:53 - 130:54] // CHECK: 130:58: UnexposedExpr= Extent=[130:58 - 130:64] // CHECK: 130:58: IntegerLiteral= Extent=[130:58 - 130:64] -// CHECK: 131:6: BinaryOperator= Extent=[131:6 - 131:32] -// CHECK: 131:6: BinaryOperator= Extent=[131:6 - 131:17] +// CHECK: 131:5: ParenExpr= Extent=[131:5 - 131:33] +// CHECK: 131:6: BinaryOperator=&& Extent=[131:6 - 131:32] +// CHECK: 131:6: BinaryOperator=>= Extent=[131:6 - 131:17] +// CHECK: 131:6: UnexposedExpr=c:2:14 Extent=[131:6 - 131:7] // CHECK: 131:6: DeclRefExpr=c:2:14 Extent=[131:6 - 131:7] // CHECK: 131:11: UnexposedExpr= Extent=[131:11 - 131:17] // CHECK: 131:11: IntegerLiteral= Extent=[131:11 - 131:17] -// CHECK: 131:21: BinaryOperator= Extent=[131:21 - 131:32] +// CHECK: 131:21: BinaryOperator=<= Extent=[131:21 - 131:32] +// CHECK: 131:21: UnexposedExpr=c:2:14 Extent=[131:21 - 131:22] // CHECK: 131:21: DeclRefExpr=c:2:14 Extent=[131:21 - 131:22] // CHECK: 131:26: UnexposedExpr= Extent=[131:26 - 131:32] // CHECK: 131:26: IntegerLiteral= Extent=[131:26 - 131:32] -// CHECK: 132:9: BinaryOperator= Extent=[132:9 - 132:35] -// CHECK: 132:9: BinaryOperator= Extent=[132:9 - 132:20] +// CHECK: 132:8: ParenExpr= Extent=[132:8 - 132:36] +// CHECK: 132:9: BinaryOperator=&& Extent=[132:9 - 132:35] +// CHECK: 132:9: BinaryOperator=>= Extent=[132:9 - 132:20] +// CHECK: 132:9: UnexposedExpr=c:2:14 Extent=[132:9 - 132:10] // CHECK: 132:9: DeclRefExpr=c:2:14 Extent=[132:9 - 132:10] // CHECK: 132:14: UnexposedExpr= Extent=[132:14 - 132:20] // CHECK: 132:14: IntegerLiteral= Extent=[132:14 - 132:20] -// CHECK: 132:24: BinaryOperator= Extent=[132:24 - 132:35] +// CHECK: 132:24: BinaryOperator=<= Extent=[132:24 - 132:35] +// CHECK: 132:24: UnexposedExpr=c:2:14 Extent=[132:24 - 132:25] // CHECK: 132:24: DeclRefExpr=c:2:14 Extent=[132:24 - 132:25] // CHECK: 132:29: UnexposedExpr= Extent=[132:29 - 132:35] // CHECK: 132:29: IntegerLiteral= Extent=[132:29 - 132:35] -// CHECK: 133:8: BinaryOperator= Extent=[133:8 - 133:19] +// CHECK: 133:8: BinaryOperator=== Extent=[133:8 - 133:19] +// CHECK: 133:8: UnexposedExpr=c:2:14 Extent=[133:8 - 133:9] // CHECK: 133:8: DeclRefExpr=c:2:14 Extent=[133:8 - 133:9] // CHECK: 133:13: UnexposedExpr= Extent=[133:13 - 133:19] // CHECK: 133:13: IntegerLiteral= Extent=[133:13 - 133:19] -// CHECK: 133:24: BinaryOperator= Extent=[133:24 - 133:50] -// CHECK: 133:24: BinaryOperator= Extent=[133:24 - 133:35] +// CHECK: 133:23: ParenExpr= Extent=[133:23 - 133:51] +// CHECK: 133:24: BinaryOperator=&& Extent=[133:24 - 133:50] +// CHECK: 133:24: BinaryOperator=>= Extent=[133:24 - 133:35] +// CHECK: 133:24: UnexposedExpr=c:2:14 Extent=[133:24 - 133:25] // CHECK: 133:24: DeclRefExpr=c:2:14 Extent=[133:24 - 133:25] // CHECK: 133:29: UnexposedExpr= Extent=[133:29 - 133:35] // CHECK: 133:29: IntegerLiteral= Extent=[133:29 - 133:35] -// CHECK: 133:39: BinaryOperator= Extent=[133:39 - 133:50] +// CHECK: 133:39: BinaryOperator=<= Extent=[133:39 - 133:50] +// CHECK: 133:39: UnexposedExpr=c:2:14 Extent=[133:39 - 133:40] // CHECK: 133:39: DeclRefExpr=c:2:14 Extent=[133:39 - 133:40] // CHECK: 133:44: UnexposedExpr= Extent=[133:44 - 133:50] // CHECK: 133:44: IntegerLiteral= Extent=[133:44 - 133:50] -// CHECK: 134:8: BinaryOperator= Extent=[134:8 - 134:19] +// CHECK: 134:8: BinaryOperator=== Extent=[134:8 - 134:19] +// CHECK: 134:8: UnexposedExpr=c:2:14 Extent=[134:8 - 134:9] // CHECK: 134:8: DeclRefExpr=c:2:14 Extent=[134:8 - 134:9] // CHECK: 134:13: UnexposedExpr= Extent=[134:13 - 134:19] // CHECK: 134:13: IntegerLiteral= Extent=[134:13 - 134:19] -// CHECK: 134:23: BinaryOperator= Extent=[134:23 - 134:34] +// CHECK: 134:23: BinaryOperator=== Extent=[134:23 - 134:34] +// CHECK: 134:23: UnexposedExpr=c:2:14 Extent=[134:23 - 134:24] // CHECK: 134:23: DeclRefExpr=c:2:14 Extent=[134:23 - 134:24] // CHECK: 134:28: UnexposedExpr= Extent=[134:28 - 134:34] // CHECK: 134:28: IntegerLiteral= Extent=[134:28 - 134:34] -// CHECK: 134:38: BinaryOperator= Extent=[134:38 - 134:49] +// CHECK: 134:38: BinaryOperator=== Extent=[134:38 - 134:49] +// CHECK: 134:38: UnexposedExpr=c:2:14 Extent=[134:38 - 134:39] // CHECK: 134:38: DeclRefExpr=c:2:14 Extent=[134:38 - 134:39] // CHECK: 134:43: UnexposedExpr= Extent=[134:43 - 134:49] // CHECK: 134:43: IntegerLiteral= Extent=[134:43 - 134:49] -// CHECK: 134:54: BinaryOperator= Extent=[134:54 - 134:80] -// CHECK: 134:54: BinaryOperator= Extent=[134:54 - 134:65] +// CHECK: 134:53: ParenExpr= Extent=[134:53 - 134:81] +// CHECK: 134:54: BinaryOperator=&& Extent=[134:54 - 134:80] +// CHECK: 134:54: BinaryOperator=>= Extent=[134:54 - 134:65] +// CHECK: 134:54: UnexposedExpr=c:2:14 Extent=[134:54 - 134:55] // CHECK: 134:54: DeclRefExpr=c:2:14 Extent=[134:54 - 134:55] // CHECK: 134:59: UnexposedExpr= Extent=[134:59 - 134:65] // CHECK: 134:59: IntegerLiteral= Extent=[134:59 - 134:65] -// CHECK: 134:69: BinaryOperator= Extent=[134:69 - 134:80] +// CHECK: 134:69: BinaryOperator=<= Extent=[134:69 - 134:80] +// CHECK: 134:69: UnexposedExpr=c:2:14 Extent=[134:69 - 134:70] // CHECK: 134:69: DeclRefExpr=c:2:14 Extent=[134:69 - 134:70] // CHECK: 134:74: UnexposedExpr= Extent=[134:74 - 134:80] // CHECK: 134:74: IntegerLiteral= Extent=[134:74 - 134:80] -// CHECK: 135:9: BinaryOperator= Extent=[135:9 - 135:35] -// CHECK: 135:9: BinaryOperator= Extent=[135:9 - 135:20] +// CHECK: 135:8: ParenExpr= Extent=[135:8 - 135:36] +// CHECK: 135:9: BinaryOperator=&& Extent=[135:9 - 135:35] +// CHECK: 135:9: BinaryOperator=>= Extent=[135:9 - 135:20] +// CHECK: 135:9: UnexposedExpr=c:2:14 Extent=[135:9 - 135:10] // CHECK: 135:9: DeclRefExpr=c:2:14 Extent=[135:9 - 135:10] // CHECK: 135:14: UnexposedExpr= Extent=[135:14 - 135:20] // CHECK: 135:14: IntegerLiteral= Extent=[135:14 - 135:20] -// CHECK: 135:24: BinaryOperator= Extent=[135:24 - 135:35] +// CHECK: 135:24: BinaryOperator=<= Extent=[135:24 - 135:35] +// CHECK: 135:24: UnexposedExpr=c:2:14 Extent=[135:24 - 135:25] // CHECK: 135:24: DeclRefExpr=c:2:14 Extent=[135:24 - 135:25] // CHECK: 135:29: UnexposedExpr= Extent=[135:29 - 135:35] // CHECK: 135:29: IntegerLiteral= Extent=[135:29 - 135:35] -// CHECK: 136:9: BinaryOperator= Extent=[136:9 - 136:35] -// CHECK: 136:9: BinaryOperator= Extent=[136:9 - 136:20] +// CHECK: 136:8: ParenExpr= Extent=[136:8 - 136:36] +// CHECK: 136:9: BinaryOperator=&& Extent=[136:9 - 136:35] +// CHECK: 136:9: BinaryOperator=>= Extent=[136:9 - 136:20] +// CHECK: 136:9: UnexposedExpr=c:2:14 Extent=[136:9 - 136:10] // CHECK: 136:9: DeclRefExpr=c:2:14 Extent=[136:9 - 136:10] // CHECK: 136:14: UnexposedExpr= Extent=[136:14 - 136:20] // CHECK: 136:14: IntegerLiteral= Extent=[136:14 - 136:20] -// CHECK: 136:24: BinaryOperator= Extent=[136:24 - 136:35] +// CHECK: 136:24: BinaryOperator=<= Extent=[136:24 - 136:35] +// CHECK: 136:24: UnexposedExpr=c:2:14 Extent=[136:24 - 136:25] // CHECK: 136:24: DeclRefExpr=c:2:14 Extent=[136:24 - 136:25] // CHECK: 136:29: UnexposedExpr= Extent=[136:29 - 136:35] // CHECK: 136:29: IntegerLiteral= Extent=[136:29 - 136:35] -// CHECK: 137:9: BinaryOperator= Extent=[137:9 - 137:35] -// CHECK: 137:9: BinaryOperator= Extent=[137:9 - 137:20] +// CHECK: 137:8: ParenExpr= Extent=[137:8 - 137:36] +// CHECK: 137:9: BinaryOperator=&& Extent=[137:9 - 137:35] +// CHECK: 137:9: BinaryOperator=>= Extent=[137:9 - 137:20] +// CHECK: 137:9: UnexposedExpr=c:2:14 Extent=[137:9 - 137:10] // CHECK: 137:9: DeclRefExpr=c:2:14 Extent=[137:9 - 137:10] // CHECK: 137:14: UnexposedExpr= Extent=[137:14 - 137:20] // CHECK: 137:14: IntegerLiteral= Extent=[137:14 - 137:20] -// CHECK: 137:24: BinaryOperator= Extent=[137:24 - 137:35] +// CHECK: 137:24: BinaryOperator=<= Extent=[137:24 - 137:35] +// CHECK: 137:24: UnexposedExpr=c:2:14 Extent=[137:24 - 137:25] // CHECK: 137:24: DeclRefExpr=c:2:14 Extent=[137:24 - 137:25] // CHECK: 137:29: UnexposedExpr= Extent=[137:29 - 137:35] // CHECK: 137:29: IntegerLiteral= Extent=[137:29 - 137:35] -// CHECK: 138:9: BinaryOperator= Extent=[138:9 - 138:35] -// CHECK: 138:9: BinaryOperator= Extent=[138:9 - 138:20] +// CHECK: 138:8: ParenExpr= Extent=[138:8 - 138:36] +// CHECK: 138:9: BinaryOperator=&& Extent=[138:9 - 138:35] +// CHECK: 138:9: BinaryOperator=>= Extent=[138:9 - 138:20] +// CHECK: 138:9: UnexposedExpr=c:2:14 Extent=[138:9 - 138:10] // CHECK: 138:9: DeclRefExpr=c:2:14 Extent=[138:9 - 138:10] // CHECK: 138:14: UnexposedExpr= Extent=[138:14 - 138:20] // CHECK: 138:14: IntegerLiteral= Extent=[138:14 - 138:20] -// CHECK: 138:24: BinaryOperator= Extent=[138:24 - 138:35] +// CHECK: 138:24: BinaryOperator=<= Extent=[138:24 - 138:35] +// CHECK: 138:24: UnexposedExpr=c:2:14 Extent=[138:24 - 138:25] // CHECK: 138:24: DeclRefExpr=c:2:14 Extent=[138:24 - 138:25] // CHECK: 138:29: UnexposedExpr= Extent=[138:29 - 138:35] // CHECK: 138:29: IntegerLiteral= Extent=[138:29 - 138:35] -// CHECK: 139:9: BinaryOperator= Extent=[139:9 - 139:35] -// CHECK: 139:9: BinaryOperator= Extent=[139:9 - 139:20] +// CHECK: 139:8: ParenExpr= Extent=[139:8 - 139:36] +// CHECK: 139:9: BinaryOperator=&& Extent=[139:9 - 139:35] +// CHECK: 139:9: BinaryOperator=>= Extent=[139:9 - 139:20] +// CHECK: 139:9: UnexposedExpr=c:2:14 Extent=[139:9 - 139:10] // CHECK: 139:9: DeclRefExpr=c:2:14 Extent=[139:9 - 139:10] // CHECK: 139:14: UnexposedExpr= Extent=[139:14 - 139:20] // CHECK: 139:14: IntegerLiteral= Extent=[139:14 - 139:20] -// CHECK: 139:24: BinaryOperator= Extent=[139:24 - 139:35] +// CHECK: 139:24: BinaryOperator=<= Extent=[139:24 - 139:35] +// CHECK: 139:24: UnexposedExpr=c:2:14 Extent=[139:24 - 139:25] // CHECK: 139:24: DeclRefExpr=c:2:14 Extent=[139:24 - 139:25] // CHECK: 139:29: UnexposedExpr= Extent=[139:29 - 139:35] // CHECK: 139:29: IntegerLiteral= Extent=[139:29 - 139:35] -// CHECK: 140:9: BinaryOperator= Extent=[140:9 - 140:35] -// CHECK: 140:9: BinaryOperator= Extent=[140:9 - 140:20] +// CHECK: 140:8: ParenExpr= Extent=[140:8 - 140:36] +// CHECK: 140:9: BinaryOperator=&& Extent=[140:9 - 140:35] +// CHECK: 140:9: BinaryOperator=>= Extent=[140:9 - 140:20] +// CHECK: 140:9: UnexposedExpr=c:2:14 Extent=[140:9 - 140:10] // CHECK: 140:9: DeclRefExpr=c:2:14 Extent=[140:9 - 140:10] // CHECK: 140:14: UnexposedExpr= Extent=[140:14 - 140:20] // CHECK: 140:14: IntegerLiteral= Extent=[140:14 - 140:20] -// CHECK: 140:24: BinaryOperator= Extent=[140:24 - 140:35] +// CHECK: 140:24: BinaryOperator=<= Extent=[140:24 - 140:35] +// CHECK: 140:24: UnexposedExpr=c:2:14 Extent=[140:24 - 140:25] // CHECK: 140:24: DeclRefExpr=c:2:14 Extent=[140:24 - 140:25] // CHECK: 140:29: UnexposedExpr= Extent=[140:29 - 140:35] // CHECK: 140:29: IntegerLiteral= Extent=[140:29 - 140:35] -// CHECK: 141:8: BinaryOperator= Extent=[141:8 - 141:19] +// CHECK: 141:8: BinaryOperator=== Extent=[141:8 - 141:19] +// CHECK: 141:8: UnexposedExpr=c:2:14 Extent=[141:8 - 141:9] // CHECK: 141:8: DeclRefExpr=c:2:14 Extent=[141:8 - 141:9] // CHECK: 141:13: UnexposedExpr= Extent=[141:13 - 141:19] // CHECK: 141:13: IntegerLiteral= Extent=[141:13 - 141:19] -// CHECK: 141:23: BinaryOperator= Extent=[141:23 - 141:34] +// CHECK: 141:23: BinaryOperator=== Extent=[141:23 - 141:34] +// CHECK: 141:23: UnexposedExpr=c:2:14 Extent=[141:23 - 141:24] // CHECK: 141:23: DeclRefExpr=c:2:14 Extent=[141:23 - 141:24] // CHECK: 141:28: UnexposedExpr= Extent=[141:28 - 141:34] // CHECK: 141:28: IntegerLiteral= Extent=[141:28 - 141:34] -// CHECK: 141:38: BinaryOperator= Extent=[141:38 - 141:49] +// CHECK: 141:38: BinaryOperator=== Extent=[141:38 - 141:49] +// CHECK: 141:38: UnexposedExpr=c:2:14 Extent=[141:38 - 141:39] // CHECK: 141:38: DeclRefExpr=c:2:14 Extent=[141:38 - 141:39] // CHECK: 141:43: UnexposedExpr= Extent=[141:43 - 141:49] // CHECK: 141:43: IntegerLiteral= Extent=[141:43 - 141:49] -// CHECK: 141:54: BinaryOperator= Extent=[141:54 - 141:80] -// CHECK: 141:54: BinaryOperator= Extent=[141:54 - 141:65] +// CHECK: 141:53: ParenExpr= Extent=[141:53 - 141:81] +// CHECK: 141:54: BinaryOperator=&& Extent=[141:54 - 141:80] +// CHECK: 141:54: BinaryOperator=>= Extent=[141:54 - 141:65] +// CHECK: 141:54: UnexposedExpr=c:2:14 Extent=[141:54 - 141:55] // CHECK: 141:54: DeclRefExpr=c:2:14 Extent=[141:54 - 141:55] // CHECK: 141:59: UnexposedExpr= Extent=[141:59 - 141:65] // CHECK: 141:59: IntegerLiteral= Extent=[141:59 - 141:65] -// CHECK: 141:69: BinaryOperator= Extent=[141:69 - 141:80] +// CHECK: 141:69: BinaryOperator=<= Extent=[141:69 - 141:80] +// CHECK: 141:69: UnexposedExpr=c:2:14 Extent=[141:69 - 141:70] // CHECK: 141:69: DeclRefExpr=c:2:14 Extent=[141:69 - 141:70] // CHECK: 141:74: UnexposedExpr= Extent=[141:74 - 141:80] // CHECK: 141:74: IntegerLiteral= Extent=[141:74 - 141:80] -// CHECK: 142:9: BinaryOperator= Extent=[142:9 - 142:35] -// CHECK: 142:9: BinaryOperator= Extent=[142:9 - 142:20] +// CHECK: 142:8: ParenExpr= Extent=[142:8 - 142:36] +// CHECK: 142:9: BinaryOperator=&& Extent=[142:9 - 142:35] +// CHECK: 142:9: BinaryOperator=>= Extent=[142:9 - 142:20] +// CHECK: 142:9: UnexposedExpr=c:2:14 Extent=[142:9 - 142:10] // CHECK: 142:9: DeclRefExpr=c:2:14 Extent=[142:9 - 142:10] // CHECK: 142:14: UnexposedExpr= Extent=[142:14 - 142:20] // CHECK: 142:14: IntegerLiteral= Extent=[142:14 - 142:20] -// CHECK: 142:24: BinaryOperator= Extent=[142:24 - 142:35] +// CHECK: 142:24: BinaryOperator=<= Extent=[142:24 - 142:35] +// CHECK: 142:24: UnexposedExpr=c:2:14 Extent=[142:24 - 142:25] // CHECK: 142:24: DeclRefExpr=c:2:14 Extent=[142:24 - 142:25] // CHECK: 142:29: UnexposedExpr= Extent=[142:29 - 142:35] // CHECK: 142:29: IntegerLiteral= Extent=[142:29 - 142:35] -// CHECK: 143:9: BinaryOperator= Extent=[143:9 - 143:35] -// CHECK: 143:9: BinaryOperator= Extent=[143:9 - 143:20] +// CHECK: 143:8: ParenExpr= Extent=[143:8 - 143:36] +// CHECK: 143:9: BinaryOperator=&& Extent=[143:9 - 143:35] +// CHECK: 143:9: BinaryOperator=>= Extent=[143:9 - 143:20] +// CHECK: 143:9: UnexposedExpr=c:2:14 Extent=[143:9 - 143:10] // CHECK: 143:9: DeclRefExpr=c:2:14 Extent=[143:9 - 143:10] // CHECK: 143:14: UnexposedExpr= Extent=[143:14 - 143:20] // CHECK: 143:14: IntegerLiteral= Extent=[143:14 - 143:20] -// CHECK: 143:24: BinaryOperator= Extent=[143:24 - 143:35] +// CHECK: 143:24: BinaryOperator=<= Extent=[143:24 - 143:35] +// CHECK: 143:24: UnexposedExpr=c:2:14 Extent=[143:24 - 143:25] // CHECK: 143:24: DeclRefExpr=c:2:14 Extent=[143:24 - 143:25] // CHECK: 143:29: UnexposedExpr= Extent=[143:29 - 143:35] // CHECK: 143:29: IntegerLiteral= Extent=[143:29 - 143:35] -// CHECK: 144:8: BinaryOperator= Extent=[144:8 - 144:19] +// CHECK: 144:8: BinaryOperator=== Extent=[144:8 - 144:19] +// CHECK: 144:8: UnexposedExpr=c:2:14 Extent=[144:8 - 144:9] // CHECK: 144:8: DeclRefExpr=c:2:14 Extent=[144:8 - 144:9] // CHECK: 144:13: UnexposedExpr= Extent=[144:13 - 144:19] // CHECK: 144:13: IntegerLiteral= Extent=[144:13 - 144:19] -// CHECK: 144:24: BinaryOperator= Extent=[144:24 - 144:50] -// CHECK: 144:24: BinaryOperator= Extent=[144:24 - 144:35] +// CHECK: 144:23: ParenExpr= Extent=[144:23 - 144:51] +// CHECK: 144:24: BinaryOperator=&& Extent=[144:24 - 144:50] +// CHECK: 144:24: BinaryOperator=>= Extent=[144:24 - 144:35] +// CHECK: 144:24: UnexposedExpr=c:2:14 Extent=[144:24 - 144:25] // CHECK: 144:24: DeclRefExpr=c:2:14 Extent=[144:24 - 144:25] // CHECK: 144:29: UnexposedExpr= Extent=[144:29 - 144:35] // CHECK: 144:29: IntegerLiteral= Extent=[144:29 - 144:35] -// CHECK: 144:39: BinaryOperator= Extent=[144:39 - 144:50] +// CHECK: 144:39: BinaryOperator=<= Extent=[144:39 - 144:50] +// CHECK: 144:39: UnexposedExpr=c:2:14 Extent=[144:39 - 144:40] // CHECK: 144:39: DeclRefExpr=c:2:14 Extent=[144:39 - 144:40] // CHECK: 144:44: UnexposedExpr= Extent=[144:44 - 144:50] // CHECK: 144:44: IntegerLiteral= Extent=[144:44 - 144:50] -// CHECK: 145:9: BinaryOperator= Extent=[145:9 - 145:35] -// CHECK: 145:9: BinaryOperator= Extent=[145:9 - 145:20] +// CHECK: 145:8: ParenExpr= Extent=[145:8 - 145:36] +// CHECK: 145:9: BinaryOperator=&& Extent=[145:9 - 145:35] +// CHECK: 145:9: BinaryOperator=>= Extent=[145:9 - 145:20] +// CHECK: 145:9: UnexposedExpr=c:2:14 Extent=[145:9 - 145:10] // CHECK: 145:9: DeclRefExpr=c:2:14 Extent=[145:9 - 145:10] // CHECK: 145:14: UnexposedExpr= Extent=[145:14 - 145:20] // CHECK: 145:14: IntegerLiteral= Extent=[145:14 - 145:20] -// CHECK: 145:24: BinaryOperator= Extent=[145:24 - 145:35] +// CHECK: 145:24: BinaryOperator=<= Extent=[145:24 - 145:35] +// CHECK: 145:24: UnexposedExpr=c:2:14 Extent=[145:24 - 145:25] // CHECK: 145:24: DeclRefExpr=c:2:14 Extent=[145:24 - 145:25] // CHECK: 145:29: UnexposedExpr= Extent=[145:29 - 145:35] // CHECK: 145:29: IntegerLiteral= Extent=[145:29 - 145:35] -// CHECK: 146:9: BinaryOperator= Extent=[146:9 - 146:35] -// CHECK: 146:9: BinaryOperator= Extent=[146:9 - 146:20] +// CHECK: 146:8: ParenExpr= Extent=[146:8 - 146:36] +// CHECK: 146:9: BinaryOperator=&& Extent=[146:9 - 146:35] +// CHECK: 146:9: BinaryOperator=>= Extent=[146:9 - 146:20] +// CHECK: 146:9: UnexposedExpr=c:2:14 Extent=[146:9 - 146:10] // CHECK: 146:9: DeclRefExpr=c:2:14 Extent=[146:9 - 146:10] // CHECK: 146:14: UnexposedExpr= Extent=[146:14 - 146:20] // CHECK: 146:14: IntegerLiteral= Extent=[146:14 - 146:20] -// CHECK: 146:24: BinaryOperator= Extent=[146:24 - 146:35] +// CHECK: 146:24: BinaryOperator=<= Extent=[146:24 - 146:35] +// CHECK: 146:24: UnexposedExpr=c:2:14 Extent=[146:24 - 146:25] // CHECK: 146:24: DeclRefExpr=c:2:14 Extent=[146:24 - 146:25] // CHECK: 146:29: UnexposedExpr= Extent=[146:29 - 146:35] // CHECK: 146:29: IntegerLiteral= Extent=[146:29 - 146:35] -// CHECK: 147:9: BinaryOperator= Extent=[147:9 - 147:35] -// CHECK: 147:9: BinaryOperator= Extent=[147:9 - 147:20] +// CHECK: 147:8: ParenExpr= Extent=[147:8 - 147:36] +// CHECK: 147:9: BinaryOperator=&& Extent=[147:9 - 147:35] +// CHECK: 147:9: BinaryOperator=>= Extent=[147:9 - 147:20] +// CHECK: 147:9: UnexposedExpr=c:2:14 Extent=[147:9 - 147:10] // CHECK: 147:9: DeclRefExpr=c:2:14 Extent=[147:9 - 147:10] // CHECK: 147:14: UnexposedExpr= Extent=[147:14 - 147:20] // CHECK: 147:14: IntegerLiteral= Extent=[147:14 - 147:20] -// CHECK: 147:24: BinaryOperator= Extent=[147:24 - 147:35] +// CHECK: 147:24: BinaryOperator=<= Extent=[147:24 - 147:35] +// CHECK: 147:24: UnexposedExpr=c:2:14 Extent=[147:24 - 147:25] // CHECK: 147:24: DeclRefExpr=c:2:14 Extent=[147:24 - 147:25] // CHECK: 147:29: UnexposedExpr= Extent=[147:29 - 147:35] // CHECK: 147:29: IntegerLiteral= Extent=[147:29 - 147:35] -// CHECK: 148:9: BinaryOperator= Extent=[148:9 - 148:35] -// CHECK: 148:9: BinaryOperator= Extent=[148:9 - 148:20] +// CHECK: 148:8: ParenExpr= Extent=[148:8 - 148:36] +// CHECK: 148:9: BinaryOperator=&& Extent=[148:9 - 148:35] +// CHECK: 148:9: BinaryOperator=>= Extent=[148:9 - 148:20] +// CHECK: 148:9: UnexposedExpr=c:2:14 Extent=[148:9 - 148:10] // CHECK: 148:9: DeclRefExpr=c:2:14 Extent=[148:9 - 148:10] // CHECK: 148:14: UnexposedExpr= Extent=[148:14 - 148:20] // CHECK: 148:14: IntegerLiteral= Extent=[148:14 - 148:20] -// CHECK: 148:24: BinaryOperator= Extent=[148:24 - 148:35] +// CHECK: 148:24: BinaryOperator=<= Extent=[148:24 - 148:35] +// CHECK: 148:24: UnexposedExpr=c:2:14 Extent=[148:24 - 148:25] // CHECK: 148:24: DeclRefExpr=c:2:14 Extent=[148:24 - 148:25] // CHECK: 148:29: UnexposedExpr= Extent=[148:29 - 148:35] // CHECK: 148:29: IntegerLiteral= Extent=[148:29 - 148:35] -// CHECK: 149:9: BinaryOperator= Extent=[149:9 - 149:35] -// CHECK: 149:9: BinaryOperator= Extent=[149:9 - 149:20] +// CHECK: 149:8: ParenExpr= Extent=[149:8 - 149:36] +// CHECK: 149:9: BinaryOperator=&& Extent=[149:9 - 149:35] +// CHECK: 149:9: BinaryOperator=>= Extent=[149:9 - 149:20] +// CHECK: 149:9: UnexposedExpr=c:2:14 Extent=[149:9 - 149:10] // CHECK: 149:9: DeclRefExpr=c:2:14 Extent=[149:9 - 149:10] // CHECK: 149:14: UnexposedExpr= Extent=[149:14 - 149:20] // CHECK: 149:14: IntegerLiteral= Extent=[149:14 - 149:20] -// CHECK: 149:24: BinaryOperator= Extent=[149:24 - 149:35] +// CHECK: 149:24: BinaryOperator=<= Extent=[149:24 - 149:35] +// CHECK: 149:24: UnexposedExpr=c:2:14 Extent=[149:24 - 149:25] // CHECK: 149:24: DeclRefExpr=c:2:14 Extent=[149:24 - 149:25] // CHECK: 149:29: UnexposedExpr= Extent=[149:29 - 149:35] // CHECK: 149:29: IntegerLiteral= Extent=[149:29 - 149:35] -// CHECK: 150:9: BinaryOperator= Extent=[150:9 - 150:35] -// CHECK: 150:9: BinaryOperator= Extent=[150:9 - 150:20] +// CHECK: 150:8: ParenExpr= Extent=[150:8 - 150:36] +// CHECK: 150:9: BinaryOperator=&& Extent=[150:9 - 150:35] +// CHECK: 150:9: BinaryOperator=>= Extent=[150:9 - 150:20] +// CHECK: 150:9: UnexposedExpr=c:2:14 Extent=[150:9 - 150:10] // CHECK: 150:9: DeclRefExpr=c:2:14 Extent=[150:9 - 150:10] // CHECK: 150:14: UnexposedExpr= Extent=[150:14 - 150:20] // CHECK: 150:14: IntegerLiteral= Extent=[150:14 - 150:20] -// CHECK: 150:24: BinaryOperator= Extent=[150:24 - 150:35] +// CHECK: 150:24: BinaryOperator=<= Extent=[150:24 - 150:35] +// CHECK: 150:24: UnexposedExpr=c:2:14 Extent=[150:24 - 150:25] // CHECK: 150:24: DeclRefExpr=c:2:14 Extent=[150:24 - 150:25] // CHECK: 150:29: UnexposedExpr= Extent=[150:29 - 150:35] // CHECK: 150:29: IntegerLiteral= Extent=[150:29 - 150:35] -// CHECK: 151:8: BinaryOperator= Extent=[151:8 - 151:19] +// CHECK: 151:8: BinaryOperator=== Extent=[151:8 - 151:19] +// CHECK: 151:8: UnexposedExpr=c:2:14 Extent=[151:8 - 151:9] // CHECK: 151:8: DeclRefExpr=c:2:14 Extent=[151:8 - 151:9] // CHECK: 151:13: UnexposedExpr= Extent=[151:13 - 151:19] // CHECK: 151:13: IntegerLiteral= Extent=[151:13 - 151:19] -// CHECK: 151:24: BinaryOperator= Extent=[151:24 - 151:50] -// CHECK: 151:24: BinaryOperator= Extent=[151:24 - 151:35] +// CHECK: 151:23: ParenExpr= Extent=[151:23 - 151:51] +// CHECK: 151:24: BinaryOperator=&& Extent=[151:24 - 151:50] +// CHECK: 151:24: BinaryOperator=>= Extent=[151:24 - 151:35] +// CHECK: 151:24: UnexposedExpr=c:2:14 Extent=[151:24 - 151:25] // CHECK: 151:24: DeclRefExpr=c:2:14 Extent=[151:24 - 151:25] // CHECK: 151:29: UnexposedExpr= Extent=[151:29 - 151:35] // CHECK: 151:29: IntegerLiteral= Extent=[151:29 - 151:35] -// CHECK: 151:39: BinaryOperator= Extent=[151:39 - 151:50] +// CHECK: 151:39: BinaryOperator=<= Extent=[151:39 - 151:50] +// CHECK: 151:39: UnexposedExpr=c:2:14 Extent=[151:39 - 151:40] // CHECK: 151:39: DeclRefExpr=c:2:14 Extent=[151:39 - 151:40] // CHECK: 151:44: UnexposedExpr= Extent=[151:44 - 151:50] // CHECK: 151:44: IntegerLiteral= Extent=[151:44 - 151:50] -// CHECK: 152:8: BinaryOperator= Extent=[152:8 - 152:19] +// CHECK: 152:8: BinaryOperator=== Extent=[152:8 - 152:19] +// CHECK: 152:8: UnexposedExpr=c:2:14 Extent=[152:8 - 152:9] // CHECK: 152:8: DeclRefExpr=c:2:14 Extent=[152:8 - 152:9] // CHECK: 152:13: UnexposedExpr= Extent=[152:13 - 152:19] // CHECK: 152:13: IntegerLiteral= Extent=[152:13 - 152:19] -// CHECK: 152:24: BinaryOperator= Extent=[152:24 - 152:50] -// CHECK: 152:24: BinaryOperator= Extent=[152:24 - 152:35] +// CHECK: 152:23: ParenExpr= Extent=[152:23 - 152:51] +// CHECK: 152:24: BinaryOperator=&& Extent=[152:24 - 152:50] +// CHECK: 152:24: BinaryOperator=>= Extent=[152:24 - 152:35] +// CHECK: 152:24: UnexposedExpr=c:2:14 Extent=[152:24 - 152:25] // CHECK: 152:24: DeclRefExpr=c:2:14 Extent=[152:24 - 152:25] // CHECK: 152:29: UnexposedExpr= Extent=[152:29 - 152:35] // CHECK: 152:29: IntegerLiteral= Extent=[152:29 - 152:35] -// CHECK: 152:39: BinaryOperator= Extent=[152:39 - 152:50] +// CHECK: 152:39: BinaryOperator=<= Extent=[152:39 - 152:50] +// CHECK: 152:39: UnexposedExpr=c:2:14 Extent=[152:39 - 152:40] // CHECK: 152:39: DeclRefExpr=c:2:14 Extent=[152:39 - 152:40] // CHECK: 152:44: UnexposedExpr= Extent=[152:44 - 152:50] // CHECK: 152:44: IntegerLiteral= Extent=[152:44 - 152:50] -// CHECK: 153:9: BinaryOperator= Extent=[153:9 - 153:35] -// CHECK: 153:9: BinaryOperator= Extent=[153:9 - 153:20] +// CHECK: 153:8: ParenExpr= Extent=[153:8 - 153:36] +// CHECK: 153:9: BinaryOperator=&& Extent=[153:9 - 153:35] +// CHECK: 153:9: BinaryOperator=>= Extent=[153:9 - 153:20] +// CHECK: 153:9: UnexposedExpr=c:2:14 Extent=[153:9 - 153:10] // CHECK: 153:9: DeclRefExpr=c:2:14 Extent=[153:9 - 153:10] // CHECK: 153:14: UnexposedExpr= Extent=[153:14 - 153:20] // CHECK: 153:14: IntegerLiteral= Extent=[153:14 - 153:20] -// CHECK: 153:24: BinaryOperator= Extent=[153:24 - 153:35] +// CHECK: 153:24: BinaryOperator=<= Extent=[153:24 - 153:35] +// CHECK: 153:24: UnexposedExpr=c:2:14 Extent=[153:24 - 153:25] // CHECK: 153:24: DeclRefExpr=c:2:14 Extent=[153:24 - 153:25] // CHECK: 153:29: UnexposedExpr= Extent=[153:29 - 153:35] // CHECK: 153:29: IntegerLiteral= Extent=[153:29 - 153:35] -// CHECK: 154:9: BinaryOperator= Extent=[154:9 - 154:35] -// CHECK: 154:9: BinaryOperator= Extent=[154:9 - 154:20] +// CHECK: 154:8: ParenExpr= Extent=[154:8 - 154:36] +// CHECK: 154:9: BinaryOperator=&& Extent=[154:9 - 154:35] +// CHECK: 154:9: BinaryOperator=>= Extent=[154:9 - 154:20] +// CHECK: 154:9: UnexposedExpr=c:2:14 Extent=[154:9 - 154:10] // CHECK: 154:9: DeclRefExpr=c:2:14 Extent=[154:9 - 154:10] // CHECK: 154:14: UnexposedExpr= Extent=[154:14 - 154:20] // CHECK: 154:14: IntegerLiteral= Extent=[154:14 - 154:20] -// CHECK: 154:24: BinaryOperator= Extent=[154:24 - 154:35] +// CHECK: 154:24: BinaryOperator=<= Extent=[154:24 - 154:35] +// CHECK: 154:24: UnexposedExpr=c:2:14 Extent=[154:24 - 154:25] // CHECK: 154:24: DeclRefExpr=c:2:14 Extent=[154:24 - 154:25] // CHECK: 154:29: UnexposedExpr= Extent=[154:29 - 154:35] // CHECK: 154:29: IntegerLiteral= Extent=[154:29 - 154:35] -// CHECK: 155:9: BinaryOperator= Extent=[155:9 - 155:35] -// CHECK: 155:9: BinaryOperator= Extent=[155:9 - 155:20] +// CHECK: 155:8: ParenExpr= Extent=[155:8 - 155:36] +// CHECK: 155:9: BinaryOperator=&& Extent=[155:9 - 155:35] +// CHECK: 155:9: BinaryOperator=>= Extent=[155:9 - 155:20] +// CHECK: 155:9: UnexposedExpr=c:2:14 Extent=[155:9 - 155:10] // CHECK: 155:9: DeclRefExpr=c:2:14 Extent=[155:9 - 155:10] // CHECK: 155:14: UnexposedExpr= Extent=[155:14 - 155:20] // CHECK: 155:14: IntegerLiteral= Extent=[155:14 - 155:20] -// CHECK: 155:24: BinaryOperator= Extent=[155:24 - 155:35] +// CHECK: 155:24: BinaryOperator=<= Extent=[155:24 - 155:35] +// CHECK: 155:24: UnexposedExpr=c:2:14 Extent=[155:24 - 155:25] // CHECK: 155:24: DeclRefExpr=c:2:14 Extent=[155:24 - 155:25] // CHECK: 155:29: UnexposedExpr= Extent=[155:29 - 155:35] // CHECK: 155:29: IntegerLiteral= Extent=[155:29 - 155:35] -// CHECK: 156:9: BinaryOperator= Extent=[156:9 - 156:35] -// CHECK: 156:9: BinaryOperator= Extent=[156:9 - 156:20] +// CHECK: 156:8: ParenExpr= Extent=[156:8 - 156:36] +// CHECK: 156:9: BinaryOperator=&& Extent=[156:9 - 156:35] +// CHECK: 156:9: BinaryOperator=>= Extent=[156:9 - 156:20] +// CHECK: 156:9: UnexposedExpr=c:2:14 Extent=[156:9 - 156:10] // CHECK: 156:9: DeclRefExpr=c:2:14 Extent=[156:9 - 156:10] // CHECK: 156:14: UnexposedExpr= Extent=[156:14 - 156:20] // CHECK: 156:14: IntegerLiteral= Extent=[156:14 - 156:20] -// CHECK: 156:24: BinaryOperator= Extent=[156:24 - 156:35] +// CHECK: 156:24: BinaryOperator=<= Extent=[156:24 - 156:35] +// CHECK: 156:24: UnexposedExpr=c:2:14 Extent=[156:24 - 156:25] // CHECK: 156:24: DeclRefExpr=c:2:14 Extent=[156:24 - 156:25] // CHECK: 156:29: UnexposedExpr= Extent=[156:29 - 156:35] // CHECK: 156:29: IntegerLiteral= Extent=[156:29 - 156:35] -// CHECK: 157:9: BinaryOperator= Extent=[157:9 - 157:35] -// CHECK: 157:9: BinaryOperator= Extent=[157:9 - 157:20] +// CHECK: 157:8: ParenExpr= Extent=[157:8 - 157:36] +// CHECK: 157:9: BinaryOperator=&& Extent=[157:9 - 157:35] +// CHECK: 157:9: BinaryOperator=>= Extent=[157:9 - 157:20] +// CHECK: 157:9: UnexposedExpr=c:2:14 Extent=[157:9 - 157:10] // CHECK: 157:9: DeclRefExpr=c:2:14 Extent=[157:9 - 157:10] // CHECK: 157:14: UnexposedExpr= Extent=[157:14 - 157:20] // CHECK: 157:14: IntegerLiteral= Extent=[157:14 - 157:20] -// CHECK: 157:24: BinaryOperator= Extent=[157:24 - 157:35] +// CHECK: 157:24: BinaryOperator=<= Extent=[157:24 - 157:35] +// CHECK: 157:24: UnexposedExpr=c:2:14 Extent=[157:24 - 157:25] // CHECK: 157:24: DeclRefExpr=c:2:14 Extent=[157:24 - 157:25] // CHECK: 157:29: UnexposedExpr= Extent=[157:29 - 157:35] // CHECK: 157:29: IntegerLiteral= Extent=[157:29 - 157:35] -// CHECK: 158:8: BinaryOperator= Extent=[158:8 - 158:19] +// CHECK: 158:8: BinaryOperator=== Extent=[158:8 - 158:19] +// CHECK: 158:8: UnexposedExpr=c:2:14 Extent=[158:8 - 158:9] // CHECK: 158:8: DeclRefExpr=c:2:14 Extent=[158:8 - 158:9] // CHECK: 158:13: UnexposedExpr= Extent=[158:13 - 158:19] // CHECK: 158:13: IntegerLiteral= Extent=[158:13 - 158:19] -// CHECK: 158:24: BinaryOperator= Extent=[158:24 - 158:50] -// CHECK: 158:24: BinaryOperator= Extent=[158:24 - 158:35] +// CHECK: 158:23: ParenExpr= Extent=[158:23 - 158:51] +// CHECK: 158:24: BinaryOperator=&& Extent=[158:24 - 158:50] +// CHECK: 158:24: BinaryOperator=>= Extent=[158:24 - 158:35] +// CHECK: 158:24: UnexposedExpr=c:2:14 Extent=[158:24 - 158:25] // CHECK: 158:24: DeclRefExpr=c:2:14 Extent=[158:24 - 158:25] // CHECK: 158:29: UnexposedExpr= Extent=[158:29 - 158:35] // CHECK: 158:29: IntegerLiteral= Extent=[158:29 - 158:35] -// CHECK: 158:39: BinaryOperator= Extent=[158:39 - 158:50] +// CHECK: 158:39: BinaryOperator=<= Extent=[158:39 - 158:50] +// CHECK: 158:39: UnexposedExpr=c:2:14 Extent=[158:39 - 158:40] // CHECK: 158:39: DeclRefExpr=c:2:14 Extent=[158:39 - 158:40] // CHECK: 158:44: UnexposedExpr= Extent=[158:44 - 158:50] // CHECK: 158:44: IntegerLiteral= Extent=[158:44 - 158:50] -// CHECK: 159:9: BinaryOperator= Extent=[159:9 - 159:35] -// CHECK: 159:9: BinaryOperator= Extent=[159:9 - 159:20] +// CHECK: 159:8: ParenExpr= Extent=[159:8 - 159:36] +// CHECK: 159:9: BinaryOperator=&& Extent=[159:9 - 159:35] +// CHECK: 159:9: BinaryOperator=>= Extent=[159:9 - 159:20] +// CHECK: 159:9: UnexposedExpr=c:2:14 Extent=[159:9 - 159:10] // CHECK: 159:9: DeclRefExpr=c:2:14 Extent=[159:9 - 159:10] // CHECK: 159:14: UnexposedExpr= Extent=[159:14 - 159:20] // CHECK: 159:14: IntegerLiteral= Extent=[159:14 - 159:20] -// CHECK: 159:24: BinaryOperator= Extent=[159:24 - 159:35] +// CHECK: 159:24: BinaryOperator=<= Extent=[159:24 - 159:35] +// CHECK: 159:24: UnexposedExpr=c:2:14 Extent=[159:24 - 159:25] // CHECK: 159:24: DeclRefExpr=c:2:14 Extent=[159:24 - 159:25] // CHECK: 159:29: UnexposedExpr= Extent=[159:29 - 159:35] // CHECK: 159:29: IntegerLiteral= Extent=[159:29 - 159:35] -// CHECK: 160:8: BinaryOperator= Extent=[160:8 - 160:19] +// CHECK: 160:8: BinaryOperator=== Extent=[160:8 - 160:19] +// CHECK: 160:8: UnexposedExpr=c:2:14 Extent=[160:8 - 160:9] // CHECK: 160:8: DeclRefExpr=c:2:14 Extent=[160:8 - 160:9] // CHECK: 160:13: UnexposedExpr= Extent=[160:13 - 160:19] // CHECK: 160:13: IntegerLiteral= Extent=[160:13 - 160:19] // CHECK: 160:23: ParenExpr= Extent=[160:23 - 160:51] -// CHECK: 160:24: BinaryOperator= Extent=[160:24 - 160:50] -// CHECK: 160:24: BinaryOperator= Extent=[160:24 - 160:35] +// CHECK: 160:24: BinaryOperator=&& Extent=[160:24 - 160:50] +// CHECK: 160:24: BinaryOperator=>= Extent=[160:24 - 160:35] +// CHECK: 160:24: UnexposedExpr=c:2:14 Extent=[160:24 - 160:25] // CHECK: 160:24: DeclRefExpr=c:2:14 Extent=[160:24 - 160:25] // CHECK: 160:29: UnexposedExpr= Extent=[160:29 - 160:35] // CHECK: 160:29: IntegerLiteral= Extent=[160:29 - 160:35] -// CHECK: 160:39: BinaryOperator= Extent=[160:39 - 160:50] +// CHECK: 160:39: BinaryOperator=<= Extent=[160:39 - 160:50] +// CHECK: 160:39: UnexposedExpr=c:2:14 Extent=[160:39 - 160:40] // CHECK: 160:39: DeclRefExpr=c:2:14 Extent=[160:39 - 160:40] // CHECK: 160:44: UnexposedExpr= Extent=[160:44 - 160:50] // CHECK: 160:44: IntegerLiteral= Extent=[160:44 - 160:50] - diff --git a/clang/test/Index/preamble.c b/clang/test/Index/preamble.c index ae8e1aa6a52cf3..08e62ecf449b97 100644 --- a/clang/test/Index/preamble.c +++ b/clang/test/Index/preamble.c @@ -14,7 +14,7 @@ void f(int x) { // RUN: env CINDEXTEST_EDITING=1 c-index-test -test-load-source-reparse 5 local -I %S/Inputs -include %t %s -Wunused-macros 2> %t.stderr.txt | FileCheck %s // RUN: FileCheck -check-prefix CHECK-DIAG %s < %t.stderr.txt // CHECK: preamble.h:1:12: FunctionDecl=bar:1:12 (Definition) Extent=[1:1 - 6:2] -// CHECK: preamble.h:4:3: BinaryOperator= Extent=[4:3 - 4:13] +// CHECK: preamble.h:4:3: BinaryOperator== Extent=[4:3 - 4:13] // CHECK: preamble.h:4:3: DeclRefExpr=ptr:2:8 Extent=[4:3 - 4:6] // CHECK: preamble.h:4:9: UnexposedExpr=ptr1:3:10 Extent=[4:9 - 4:13] // CHECK: preamble.h:4:9: DeclRefExpr=ptr1:3:10 Extent=[4:9 - 4:13] diff --git a/clang/test/Index/print-type.c b/clang/test/Index/print-type.c index 1e5a7248d89cbd..7375644f100590 100644 --- a/clang/test/Index/print-type.c +++ b/clang/test/Index/print-type.c @@ -51,8 +51,8 @@ _Atomic(unsigned long) aul; // CHECK: TypeRef=FooType:1:13 [type=FooType] [typekind=Typedef] [canonicaltype=int] [canonicaltypekind=Int] [isPOD=1] // CHECK: DeclRefExpr=z:3:33 [type=FooType] [typekind=Elaborated] [canonicaltype=int] [canonicaltypekind=Int] [isPOD=1] // CHECK: ReturnStmt= [type=] [typekind=Invalid] [isPOD=0] -// CHECK: BinaryOperator= [type=int *] [typekind=Pointer] [isPOD=1] [pointeetype=int] [pointeekind=Int] -// CHECK: BinaryOperator= [type=int *] [typekind=Pointer] [isPOD=1] [pointeetype=int] [pointeekind=Int] +// CHECK: BinaryOperator=+ [type=int *] [typekind=Pointer] [isPOD=1] [pointeetype=int] [pointeekind=Int] +// CHECK: BinaryOperator=+ [type=int *] [typekind=Pointer] [isPOD=1] [pointeetype=int] [pointeekind=Int] // CHECK: DeclRefExpr=p:3:13 [type=int *] [typekind=Pointer] [isPOD=1] [pointeetype=int] [pointeekind=Int] // CHECK: DeclRefExpr=z:3:33 [type=FooType] [typekind=Elaborated] [canonicaltype=int] [canonicaltypekind=Int] [isPOD=1] // CHECK: ArraySubscriptExpr= [type=int] [typekind=Int] [isPOD=1] diff --git a/clang/test/Index/print-type.cpp b/clang/test/Index/print-type.cpp index 8c3d4c254964a2..b64469e921a8dd 100644 --- a/clang/test/Index/print-type.cpp +++ b/clang/test/Index/print-type.cpp @@ -124,7 +124,7 @@ inline namespace InlineNS {} // CHECK: UnexposedExpr=z:22:35 [type=FooType] [typekind=Elaborated] [canonicaltype=int] [canonicaltypekind=Int] [isPOD=1] // CHECK: DeclRefExpr=z:22:35 [type=FooType] [typekind=Elaborated] [canonicaltype=int] [canonicaltypekind=Int] [isPOD=1] // CHECK: ReturnStmt= [type=] [typekind=Invalid] [isPOD=0] -// CHECK: BinaryOperator= [type=int *] [typekind=Pointer] [isPOD=1] [pointeetype=int] [pointeekind=Int] +// CHECK: BinaryOperator=+ [type=int *] [typekind=Pointer] [isPOD=1] [pointeetype=int] [pointeekind=Int] // CHECK: UnexposedExpr=p:22:15 [type=int *] [typekind=Pointer] [isPOD=1] [pointeetype=int] [pointeekind=Int] // CHECK: DeclRefExpr=p:22:15 [type=int *] [typekind=Pointer] [isPOD=1] [pointeetype=int] [pointeekind=Int] // CHECK: UnexposedExpr=z:22:35 [type=FooType] [typekind=Elaborated] [canonicaltype=int] [canonicaltypekind=Int] [isPOD=1] diff --git a/clang/test/Index/recursive-cxx-member-calls.cpp b/clang/test/Index/recursive-cxx-member-calls.cpp index be908c506e7476..11c011a432cd4e 100644 --- a/clang/test/Index/recursive-cxx-member-calls.cpp +++ b/clang/test/Index/recursive-cxx-member-calls.cpp @@ -21,7 +21,7 @@ namespace clang { AT_noinline, AT_no_instrument_function, AT_nonnull, AT_noreturn, AT_nothrow, AT_nsobject, AT_objc_exception, AT_override, AT_cf_returns_not_retained, AT_cf_returns_retained, - AT_ns_returns_not_retained, AT_ns_returns_retained, AT_objc_gc, + AT_ns_returns_not_retained, AT_ns_returns_retained, AT_objc_gc, AT_overloadable, AT_ownership_holds, AT_ownership_returns, AT_ownership_takes, AT_packed, AT_pascal, AT_pure, AT_regparm, AT_section, AT_sentinel, AT_stdcall, AT_thiscall, AT_transparent_union, @@ -467,7 +467,7 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK-tokens: Punctuation: "{" [45:41 - 45:42] CompoundStmt= // CHECK-tokens: Keyword: "return" [45:43 - 45:49] ReturnStmt= // CHECK-tokens: Identifier: "a" [45:50 - 45:51] DeclRefExpr=a:45:28 -// CHECK-tokens: Punctuation: "<" [45:52 - 45:53] BinaryOperator= +// CHECK-tokens: Punctuation: "<" [45:52 - 45:53] BinaryOperator=< // CHECK-tokens: Identifier: "b" [45:54 - 45:55] DeclRefExpr=b:45:38 // CHECK-tokens: Punctuation: "?" [45:56 - 45:57] ConditionalOperator= // CHECK-tokens: Identifier: "a" [45:58 - 45:59] DeclRefExpr=a:45:28 @@ -566,11 +566,11 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK-tokens: Punctuation: "{" [52:43 - 52:44] CompoundStmt= // CHECK-tokens: Keyword: "return" [53:5 - 53:11] ReturnStmt= // CHECK-tokens: Identifier: "Length" [53:12 - 53:18] MemberRefExpr=Length:44:10 -// CHECK-tokens: Punctuation: ">=" [53:19 - 53:21] BinaryOperator= +// CHECK-tokens: Punctuation: ">=" [53:19 - 53:21] BinaryOperator=>= // CHECK-tokens: Identifier: "Prefix" [53:22 - 53:28] DeclRefExpr=Prefix:52:29 // CHECK-tokens: Punctuation: "." [53:28 - 53:29] MemberRefExpr=Length:44:10 // CHECK-tokens: Identifier: "Length" [53:29 - 53:35] MemberRefExpr=Length:44:10 -// CHECK-tokens: Punctuation: "&&" [53:36 - 53:38] BinaryOperator= +// CHECK-tokens: Punctuation: "&&" [53:36 - 53:38] BinaryOperator=&& // CHECK-tokens: Identifier: "memcmp" [54:11 - 54:17] DeclRefExpr=memcmp:7:7 // CHECK-tokens: Punctuation: "(" [54:17 - 54:18] CallExpr=memcmp:7:7 // CHECK-tokens: Identifier: "Data" [54:18 - 54:22] MemberRefExpr=Data:43:15 @@ -583,7 +583,7 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK-tokens: Punctuation: "." [54:43 - 54:44] MemberRefExpr=Length:44:10 // CHECK-tokens: Identifier: "Length" [54:44 - 54:50] MemberRefExpr=Length:44:10 // CHECK-tokens: Punctuation: ")" [54:50 - 54:51] CallExpr=memcmp:7:7 -// CHECK-tokens: Punctuation: "==" [54:52 - 54:54] BinaryOperator= +// CHECK-tokens: Punctuation: "==" [54:52 - 54:54] BinaryOperator=== // CHECK-tokens: Literal: "0" [54:55 - 54:56] IntegerLiteral= // CHECK-tokens: Punctuation: ";" [54:56 - 54:57] CompoundStmt= // CHECK-tokens: Punctuation: "}" [55:3 - 55:4] CompoundStmt= @@ -597,17 +597,17 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK-tokens: Punctuation: "{" [56:41 - 56:42] CompoundStmt= // CHECK-tokens: Keyword: "return" [57:5 - 57:11] ReturnStmt= // CHECK-tokens: Identifier: "Length" [57:12 - 57:18] MemberRefExpr=Length:44:10 -// CHECK-tokens: Punctuation: ">=" [57:19 - 57:21] BinaryOperator= +// CHECK-tokens: Punctuation: ">=" [57:19 - 57:21] BinaryOperator=>= // CHECK-tokens: Identifier: "Suffix" [57:22 - 57:28] DeclRefExpr=Suffix:56:27 // CHECK-tokens: Punctuation: "." [57:28 - 57:29] MemberRefExpr=Length:44:10 // CHECK-tokens: Identifier: "Length" [57:29 - 57:35] MemberRefExpr=Length:44:10 -// CHECK-tokens: Punctuation: "&&" [57:36 - 57:38] BinaryOperator= +// CHECK-tokens: Punctuation: "&&" [57:36 - 57:38] BinaryOperator=&& // CHECK-tokens: Identifier: "memcmp" [58:7 - 58:13] DeclRefExpr=memcmp:7:7 // CHECK-tokens: Punctuation: "(" [58:13 - 58:14] CallExpr=memcmp:7:7 // CHECK-tokens: Identifier: "end" [58:14 - 58:17] MemberRefExpr=end:50:12 // CHECK-tokens: Punctuation: "(" [58:17 - 58:18] CallExpr=end:50:12 // CHECK-tokens: Punctuation: ")" [58:18 - 58:19] CallExpr=end:50:12 -// CHECK-tokens: Punctuation: "-" [58:20 - 58:21] BinaryOperator= +// CHECK-tokens: Punctuation: "-" [58:20 - 58:21] BinaryOperator=- // CHECK-tokens: Identifier: "Suffix" [58:22 - 58:28] DeclRefExpr=Suffix:56:27 // CHECK-tokens: Punctuation: "." [58:28 - 58:29] MemberRefExpr=Length:44:10 // CHECK-tokens: Identifier: "Length" [58:29 - 58:35] MemberRefExpr=Length:44:10 @@ -620,7 +620,7 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK-tokens: Punctuation: "." [58:56 - 58:57] MemberRefExpr=Length:44:10 // CHECK-tokens: Identifier: "Length" [58:57 - 58:63] MemberRefExpr=Length:44:10 // CHECK-tokens: Punctuation: ")" [58:63 - 58:64] CallExpr=memcmp:7:7 -// CHECK-tokens: Punctuation: "==" [58:65 - 58:67] BinaryOperator= +// CHECK-tokens: Punctuation: "==" [58:65 - 58:67] BinaryOperator=== // CHECK-tokens: Literal: "0" [58:68 - 58:69] IntegerLiteral= // CHECK-tokens: Punctuation: ";" [58:69 - 58:70] CompoundStmt= // CHECK-tokens: Punctuation: "}" [59:3 - 59:4] CompoundStmt= @@ -641,7 +641,7 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK-tokens: Identifier: "StringRef" [61:12 - 61:21] TypeRef=class llvm::StringRef:38:7 // CHECK-tokens: Punctuation: "(" [61:21 - 61:22] CallExpr=StringRef:49:3 // CHECK-tokens: Identifier: "Data" [61:22 - 61:26] MemberRefExpr=Data:43:15 -// CHECK-tokens: Punctuation: "+" [61:27 - 61:28] BinaryOperator= +// CHECK-tokens: Punctuation: "+" [61:27 - 61:28] BinaryOperator=+ // CHECK-tokens: Identifier: "Start" [61:29 - 61:34] DeclRefExpr=Start:60:27 // CHECK-tokens: Punctuation: "," [61:34 - 61:35] CallExpr=StringRef:49:3 // CHECK-tokens: Identifier: "min" [61:36 - 61:39] DeclRefExpr=min:45:17 @@ -649,7 +649,7 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK-tokens: Identifier: "N" [61:40 - 61:41] DeclRefExpr=N:60:41 // CHECK-tokens: Punctuation: "," [61:41 - 61:42] CallExpr=min:45:17 // CHECK-tokens: Identifier: "Length" [61:43 - 61:49] MemberRefExpr=Length:44:10 -// CHECK-tokens: Punctuation: "-" [61:50 - 61:51] BinaryOperator= +// CHECK-tokens: Punctuation: "-" [61:50 - 61:51] BinaryOperator=- // CHECK-tokens: Identifier: "Start" [61:52 - 61:57] DeclRefExpr=Start:60:27 // CHECK-tokens: Punctuation: ")" [61:57 - 61:58] CallExpr=min:45:17 // CHECK-tokens: Punctuation: ")" [61:58 - 61:59] CallExpr=StringRef:49:3 @@ -738,7 +738,7 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK-tokens: Punctuation: ")" [74:47 - 74:48] ParenExpr= // CHECK-tokens: Punctuation: "->" [74:48 - 74:50] MemberRefExpr=second:4:55 // CHECK-tokens: Identifier: "second" [74:50 - 74:56] MemberRefExpr=second:4:55 -// CHECK-tokens: Punctuation: "-" [74:57 - 74:58] BinaryOperator= +// CHECK-tokens: Punctuation: "-" [74:57 - 74:58] BinaryOperator=- // CHECK-tokens: Literal: "2" [74:59 - 74:60] IntegerLiteral= // CHECK-tokens: Punctuation: ";" [74:60 - 74:61] DeclStmt= // CHECK-tokens: Keyword: "return" [75:5 - 75:11] ReturnStmt= @@ -752,7 +752,7 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK-tokens: Literal: "0" [75:27 - 75:28] IntegerLiteral= // CHECK-tokens: Punctuation: "]" [75:28 - 75:29] ArraySubscriptExpr= // CHECK-tokens: Punctuation: ")" [75:29 - 75:30] ParenExpr= -// CHECK-tokens: Punctuation: "|" [75:31 - 75:32] BinaryOperator= +// CHECK-tokens: Punctuation: "|" [75:31 - 75:32] BinaryOperator=| // CHECK-tokens: Punctuation: "(" [75:33 - 75:34] ParenExpr= // CHECK-tokens: Punctuation: "(" [75:34 - 75:35] ParenExpr= // CHECK-tokens: Punctuation: "(" [75:35 - 75:36] CStyleCastExpr= @@ -763,11 +763,11 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK-tokens: Literal: "1" [75:48 - 75:49] IntegerLiteral= // CHECK-tokens: Punctuation: "]" [75:49 - 75:50] ArraySubscriptExpr= // CHECK-tokens: Punctuation: ")" [75:50 - 75:51] ParenExpr= -// CHECK-tokens: Punctuation: "<<" [75:52 - 75:54] BinaryOperator= +// CHECK-tokens: Punctuation: "<<" [75:52 - 75:54] BinaryOperator=<< // CHECK-tokens: Literal: "8" [75:55 - 75:56] IntegerLiteral= // CHECK-tokens: Punctuation: ")" [75:56 - 75:57] ParenExpr= // CHECK-tokens: Punctuation: ")" [75:57 - 75:58] ParenExpr= -// CHECK-tokens: Punctuation: "-" [75:59 - 75:60] BinaryOperator= +// CHECK-tokens: Punctuation: "-" [75:59 - 75:60] BinaryOperator=- // CHECK-tokens: Literal: "1" [75:61 - 75:62] IntegerLiteral= // CHECK-tokens: Punctuation: ";" [75:62 - 75:63] CompoundStmt= // CHECK-tokens: Punctuation: "}" [76:3 - 76:4] CompoundStmt= @@ -924,7 +924,7 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK-tokens: Punctuation: "(" [102:26 - 102:27] CallExpr=startswith:52:8 // CHECK-tokens: Literal: ""__"" [102:27 - 102:31] StringLiteral= // CHECK-tokens: Punctuation: ")" [102:31 - 102:32] CallExpr=startswith:52:8 -// CHECK-tokens: Punctuation: "&&" [102:33 - 102:35] BinaryOperator= +// CHECK-tokens: Punctuation: "&&" [102:33 - 102:35] BinaryOperator=&& // CHECK-tokens: Identifier: "AttrName" [102:36 - 102:44] DeclRefExpr=AttrName:101:19 // CHECK-tokens: Punctuation: "." [102:44 - 102:45] MemberRefExpr=endswith:56:8 // CHECK-tokens: Identifier: "endswith" [102:45 - 102:53] MemberRefExpr=endswith:56:8 @@ -945,7 +945,7 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK-tokens: Identifier: "size" [103:44 - 103:48] MemberRefExpr=size:51:10 // CHECK-tokens: Punctuation: "(" [103:48 - 103:49] CallExpr=size:51:10 // CHECK-tokens: Punctuation: ")" [103:49 - 103:50] CallExpr=size:51:10 -// CHECK-tokens: Punctuation: "-" [103:51 - 103:52] BinaryOperator= +// CHECK-tokens: Punctuation: "-" [103:51 - 103:52] BinaryOperator=- // CHECK-tokens: Literal: "4" [103:53 - 103:54] IntegerLiteral= // CHECK-tokens: Punctuation: ")" [103:54 - 103:55] CallExpr=substr:60:13 // CHECK-tokens: Punctuation: ";" [103:55 - 103:56] CompoundStmt= @@ -1647,7 +1647,7 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK: 45:41: CompoundStmt= Extent=[45:41 - 45:66] // CHECK: 45:43: ReturnStmt= Extent=[45:43 - 45:63] // CHECK: 45:50: ConditionalOperator= Extent=[45:50 - 45:63] -// CHECK: 45:50: BinaryOperator= Extent=[45:50 - 45:55] +// CHECK: 45:50: BinaryOperator=< Extent=[45:50 - 45:55] // CHECK: 45:50: DeclRefExpr=a:45:28 Extent=[45:50 - 45:51] // CHECK: 45:54: DeclRefExpr=b:45:38 Extent=[45:54 - 45:55] // CHECK: 45:58: DeclRefExpr=a:45:28 Extent=[45:58 - 45:59] @@ -1695,13 +1695,13 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK: 52:19: TypeRef=class llvm::StringRef:38:7 Extent=[52:19 - 52:28] // CHECK: 52:43: CompoundStmt= Extent=[52:43 - 55:4] // CHECK: 53:5: ReturnStmt= Extent=[53:5 - 54:56] -// CHECK: 53:12: BinaryOperator= Extent=[53:12 - 54:56] -// CHECK: 53:12: BinaryOperator= Extent=[53:12 - 53:35] +// CHECK: 53:12: BinaryOperator=&& Extent=[53:12 - 54:56] +// CHECK: 53:12: BinaryOperator=>= Extent=[53:12 - 53:35] // CHECK: 53:12: UnexposedExpr=Length:44:10 Extent=[53:12 - 53:18] // CHECK: 53:12: MemberRefExpr=Length:44:10 Extent=[53:12 - 53:18] // CHECK: 53:29: MemberRefExpr=Length:44:10 SingleRefName=[53:29 - 53:35] RefName=[53:29 - 53:35] Extent=[53:22 - 53:35] // CHECK: 53:22: DeclRefExpr=Prefix:52:29 Extent=[53:22 - 53:28] -// CHECK: 54:11: BinaryOperator= Extent=[54:11 - 54:56] +// CHECK: 54:11: BinaryOperator=== Extent=[54:11 - 54:56] // CHECK: 54:11: CallExpr=memcmp:7:7 Extent=[54:11 - 54:51] // CHECK: 54:11: UnexposedExpr=memcmp:7:7 Extent=[54:11 - 54:17] // CHECK: 54:11: DeclRefExpr=memcmp:7:7 Extent=[54:11 - 54:17] @@ -1718,18 +1718,18 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK: 56:17: TypeRef=class llvm::StringRef:38:7 Extent=[56:17 - 56:26] // CHECK: 56:41: CompoundStmt= Extent=[56:41 - 59:4] // CHECK: 57:5: ReturnStmt= Extent=[57:5 - 58:69] -// CHECK: 57:12: BinaryOperator= Extent=[57:12 - 58:69] -// CHECK: 57:12: BinaryOperator= Extent=[57:12 - 57:35] +// CHECK: 57:12: BinaryOperator=&& Extent=[57:12 - 58:69] +// CHECK: 57:12: BinaryOperator=>= Extent=[57:12 - 57:35] // CHECK: 57:12: UnexposedExpr=Length:44:10 Extent=[57:12 - 57:18] // CHECK: 57:12: MemberRefExpr=Length:44:10 Extent=[57:12 - 57:18] // CHECK: 57:29: MemberRefExpr=Length:44:10 SingleRefName=[57:29 - 57:35] RefName=[57:29 - 57:35] Extent=[57:22 - 57:35] // CHECK: 57:22: DeclRefExpr=Suffix:56:27 Extent=[57:22 - 57:28] -// CHECK: 58:7: BinaryOperator= Extent=[58:7 - 58:69] +// CHECK: 58:7: BinaryOperator=== Extent=[58:7 - 58:69] // CHECK: 58:7: CallExpr=memcmp:7:7 Extent=[58:7 - 58:64] // CHECK: 58:7: UnexposedExpr=memcmp:7:7 Extent=[58:7 - 58:13] // CHECK: 58:7: DeclRefExpr=memcmp:7:7 Extent=[58:7 - 58:13] // CHECK: 58:14: UnexposedExpr= Extent=[58:14 - 58:35] -// CHECK: 58:14: BinaryOperator= Extent=[58:14 - 58:35] +// CHECK: 58:14: BinaryOperator=- Extent=[58:14 - 58:35] // CHECK: 58:14: CallExpr=end:50:12 Extent=[58:14 - 58:19] // CHECK: 58:14: MemberRefExpr=end:50:12 Extent=[58:14 - 58:17] // CHECK: 58:29: MemberRefExpr=Length:44:10 SingleRefName=[58:29 - 58:35] RefName=[58:29 - 58:35] Extent=[58:22 - 58:35] @@ -1753,7 +1753,7 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK: 61:12: UnexposedExpr=StringRef:49:3 Extent=[61:12 - 61:59] // CHECK: 61:12: CallExpr=StringRef:49:3 Extent=[61:12 - 61:59] // CHECK: 61:12: TypeRef=class llvm::StringRef:38:7 Extent=[61:12 - 61:21] -// CHECK: 61:22: BinaryOperator= Extent=[61:22 - 61:34] +// CHECK: 61:22: BinaryOperator=+ Extent=[61:22 - 61:34] // CHECK: 61:22: UnexposedExpr=Data:43:15 Extent=[61:22 - 61:26] // CHECK: 61:22: MemberRefExpr=Data:43:15 Extent=[61:22 - 61:26] // CHECK: 61:29: DeclRefExpr=Start:60:27 Extent=[61:29 - 61:34] @@ -1761,7 +1761,7 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK: 61:36: UnexposedExpr=min:45:17 Extent=[61:36 - 61:39] // CHECK: 61:36: DeclRefExpr=min:45:17 Extent=[61:36 - 61:39] // CHECK: 61:40: DeclRefExpr=N:60:41 Extent=[61:40 - 61:41] -// CHECK: 61:43: BinaryOperator= Extent=[61:43 - 61:57] +// CHECK: 61:43: BinaryOperator=- Extent=[61:43 - 61:57] // CHECK: 61:43: UnexposedExpr=Length:44:10 Extent=[61:43 - 61:49] // CHECK: 61:43: MemberRefExpr=Length:44:10 Extent=[61:43 - 61:49] // CHECK: 61:52: DeclRefExpr=Start:60:27 Extent=[61:52 - 61:57] @@ -1789,7 +1789,7 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK: 73:25: TypeRef=class clang::IdentifierInfo:66:7 Extent=[73:25 - 73:39] // CHECK: 74:5: DeclStmt= Extent=[74:5 - 74:61] // CHECK: 74:17: VarDecl=p:74:17 (Definition) Extent=[74:5 - 74:60] -// CHECK: 74:21: BinaryOperator= Extent=[74:21 - 74:60] +// CHECK: 74:21: BinaryOperator=- Extent=[74:21 - 74:60] // CHECK: 74:50: UnexposedExpr=second:4:55 Extent=[74:21 - 74:56] // CHECK: 74:50: MemberRefExpr=second:4:55 SingleRefName=[74:50 - 74:56] RefName=[74:50 - 74:56] Extent=[74:21 - 74:56] // CHECK: 74:21: ParenExpr= Extent=[74:21 - 74:48] @@ -1798,9 +1798,9 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK: 74:43: CXXThisExpr= Extent=[74:43 - 74:47] // CHECK: 74:59: IntegerLiteral= Extent=[74:59 - 74:60] // CHECK: 75:5: ReturnStmt= Extent=[75:5 - 75:62] -// CHECK: 75:12: BinaryOperator= Extent=[75:12 - 75:62] +// CHECK: 75:12: BinaryOperator=- Extent=[75:12 - 75:62] // CHECK: 75:12: ParenExpr= Extent=[75:12 - 75:58] -// CHECK: 75:13: BinaryOperator= Extent=[75:13 - 75:57] +// CHECK: 75:13: BinaryOperator=| Extent=[75:13 - 75:57] // CHECK: 75:13: ParenExpr= Extent=[75:13 - 75:30] // CHECK: 75:14: CStyleCastExpr= Extent=[75:14 - 75:29] // CHECK: 75:25: UnexposedExpr= Extent=[75:25 - 75:29] @@ -1809,7 +1809,7 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK: 75:25: DeclRefExpr=p:74:17 Extent=[75:25 - 75:26] // CHECK: 75:27: IntegerLiteral= Extent=[75:27 - 75:28] // CHECK: 75:33: ParenExpr= Extent=[75:33 - 75:57] -// CHECK: 75:34: BinaryOperator= Extent=[75:34 - 75:56] +// CHECK: 75:34: BinaryOperator=<< Extent=[75:34 - 75:56] // CHECK: 75:34: ParenExpr= Extent=[75:34 - 75:51] // CHECK: 75:35: CStyleCastExpr= Extent=[75:35 - 75:50] // CHECK: 75:46: UnexposedExpr= Extent=[75:46 - 75:50] @@ -1878,7 +1878,7 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK: 101:36: MemberRefExpr=getName:77:19 SingleRefName=[101:36 - 101:43] RefName=[101:36 - 101:43] Extent=[101:30 - 101:43] // CHECK: 101:30: DeclRefExpr=Name:100:67 Extent=[101:30 - 101:34] // CHECK: 102:3: IfStmt= Extent=[102:3 - 103:55] -// CHECK: 102:7: BinaryOperator= Extent=[102:7 - 102:59] +// CHECK: 102:7: BinaryOperator=&& Extent=[102:7 - 102:59] // CHECK: 102:7: CallExpr=startswith:52:8 Extent=[102:7 - 102:32] // CHECK: 102:16: MemberRefExpr=startswith:52:8 SingleRefName=[102:16 - 102:26] RefName=[102:16 - 102:26] Extent=[102:7 - 102:26] // CHECK: 102:7: UnexposedExpr=AttrName:101:19 Extent=[102:7 - 102:15] @@ -1910,7 +1910,7 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) { // CHECK: 103:16: DeclRefExpr=AttrName:101:19 Extent=[103:16 - 103:24] // CHECK: 103:32: UnexposedExpr= Extent=[103:32 - 103:33] // CHECK: 103:32: IntegerLiteral= Extent=[103:32 - 103:33] -// CHECK: 103:35: BinaryOperator= Extent=[103:35 - 103:54] +// CHECK: 103:35: BinaryOperator=- Extent=[103:35 - 103:54] // CHECK: 103:35: CallExpr=size:51:10 Extent=[103:35 - 103:50] // CHECK: 103:44: MemberRefExpr=size:51:10 SingleRefName=[103:44 - 103:48] RefName=[103:44 - 103:48] Extent=[103:35 - 103:48] // CHECK: 103:35: UnexposedExpr=AttrName:101:19 Extent=[103:35 - 103:43] diff --git a/clang/test/Index/remap-load.c b/clang/test/Index/remap-load.c index f886cea7da7545..581095a52471bb 100644 --- a/clang/test/Index/remap-load.c +++ b/clang/test/Index/remap-load.c @@ -4,7 +4,7 @@ // CHECK: remap-load.c:1:13: ParmDecl=parm1:1:13 (Definition) Extent=[1:9 - 1:18] // CHECK: remap-load.c:1:26: ParmDecl=parm2:1:26 (Definition) Extent=[1:20 - 1:31] // CHECK: remap-load.c:2:10: UnexposedExpr= Extent=[2:10 - 2:23] -// CHECK: remap-load.c:2:10: BinaryOperator= Extent=[2:10 - 2:23] +// CHECK: remap-load.c:2:10: BinaryOperator=+ Extent=[2:10 - 2:23] // CHECK: remap-load.c:2:10: UnexposedExpr=parm1:1:13 Extent=[2:10 - 2:15] // CHECK: remap-load.c:2:10: DeclRefExpr=parm1:1:13 Extent=[2:10 - 2:15] // CHECK: remap-load.c:2:18: DeclRefExpr=parm2:1:26 Extent=[2:18 - 2:23] diff --git a/clang/test/Index/usrs.m b/clang/test/Index/usrs.m index 9a1407ce518940..2114b71e67607f 100644 --- a/clang/test/Index/usrs.m +++ b/clang/test/Index/usrs.m @@ -182,7 +182,7 @@ -(int)methodWithFn:(void (*)(int *p))fn; // CHECK-source: usrs.m:3:40: ParmDecl=y:3:40 (Definition) Extent=[3:36 - 3:41] // CHECK-source: usrs.m:3:43: CompoundStmt= Extent=[3:43 - 3:60] // CHECK-source: usrs.m:3:45: ReturnStmt= Extent=[3:45 - 3:57] -// CHECK-source: usrs.m:3:52: BinaryOperator= Extent=[3:52 - 3:57] +// CHECK-source: usrs.m:3:52: BinaryOperator=+ Extent=[3:52 - 3:57] // CHECK-source: usrs.m:3:52: DeclRefExpr=x:3:33 Extent=[3:52 - 3:53] // CHECK-source: usrs.m:3:56: DeclRefExpr=y:3:40 Extent=[3:56 - 3:57] // CHECK-source: usrs.m:5:1: EnumDecl=enum (unnamed at {{.*}}):5:1 (Definition) Extent=[5:1 - 8:2] diff --git a/clang/test/Misc/warning-flags.c b/clang/test/Misc/warning-flags.c index 7b993f6849363b..cdbe1e95cba965 100644 --- a/clang/test/Misc/warning-flags.c +++ b/clang/test/Misc/warning-flags.c @@ -18,7 +18,7 @@ This test serves two purposes: The list of warnings below should NEVER grow. It should gradually shrink to 0. -CHECK: Warnings without flags (64): +CHECK: Warnings without flags (65): CHECK-NEXT: ext_expected_semi_decl_list CHECK-NEXT: ext_missing_whitespace_after_macro_name @@ -61,6 +61,7 @@ CHECK-NEXT: warn_invalid_cpu_supports CHECK-NEXT: warn_maynot_respond CHECK-NEXT: warn_method_param_redefinition CHECK-NEXT: warn_missing_case_for_condition +CHECK-NEXT: warn_missing_dependent_template_keyword CHECK-NEXT: warn_missing_whitespace_after_macro_name CHECK-NEXT: warn_mt_message CHECK-NEXT: warn_no_constructor_for_refconst diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp index 4a321b24b8c313..6693473e892d2a 100644 --- a/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp @@ -2567,16 +2567,16 @@ int main() { // CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 // CHECK3-NEXT: [[DOTBOUND_ZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED12:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTBOUND_ZERO_ADDR17:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED11:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[DOTBOUND_ZERO_ADDR16:%.*]] = alloca i32, align 4 // CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK3-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 8 // CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 8 // CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK3-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0 -// CHECK3-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK3-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK3-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -2598,29 +2598,29 @@ int main() { // CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK3-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4 // CHECK3-NEXT: [[TMP7:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK3-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP7]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL1]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE7:%.*]] +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE6:%.*]] // CHECK3: omp_if.then: // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK3: omp.inner.for.cond: // CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP35:![0-9]+]] // CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP35]] -// CHECK3-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]] -// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]] +// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK3: omp.inner.for.body: // CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP35]] // CHECK3-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 // CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP35]] // CHECK3-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64 // CHECK3-NEXT: [[TMP14:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1, !llvm.access.group [[ACC_GRP35]] -// CHECK3-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP14]] to i1 -// CHECK3-NEXT: [[FROMBOOL4:%.*]] = zext i1 [[TOBOOL3]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL4]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1, !llvm.access.group [[ACC_GRP35]] +// CHECK3-NEXT: [[LOADEDV2:%.*]] = trunc i8 [[TMP14]] to i1 +// CHECK3-NEXT: [[STOREDV3:%.*]] = zext i1 [[LOADEDV2]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV3]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1, !llvm.access.group [[ACC_GRP35]] // CHECK3-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8, !llvm.access.group [[ACC_GRP35]] // CHECK3-NEXT: [[TMP16:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1, !llvm.access.group [[ACC_GRP35]] -// CHECK3-NEXT: [[TOBOOL5:%.*]] = trunc i8 [[TMP16]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL5]], label [[OMP_IF_THEN6:%.*]], label [[OMP_IF_ELSE:%.*]] -// CHECK3: omp_if.then6: +// CHECK3-NEXT: [[LOADEDV4:%.*]] = trunc i8 [[TMP16]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV4]], label [[OMP_IF_THEN5:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK3: omp_if.then5: // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l99.omp_outlined.omp_outlined, i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]]), !llvm.access.group [[ACC_GRP35]] // CHECK3-NEXT: br label [[OMP_IF_END:%.*]] // CHECK3: omp_if.else: @@ -2639,48 +2639,48 @@ int main() { // CHECK3-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP35]] // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]] // CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_IF_END22:%.*]] -// CHECK3: omp_if.else7: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND8:%.*]] -// CHECK3: omp.inner.for.cond8: +// CHECK3-NEXT: br label [[OMP_IF_END21:%.*]] +// CHECK3: omp_if.else6: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND7:%.*]] +// CHECK3: omp.inner.for.cond7: // CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[CMP9:%.*]] = icmp sle i32 [[TMP20]], [[TMP21]] -// CHECK3-NEXT: br i1 [[CMP9]], label [[OMP_INNER_FOR_BODY10:%.*]], label [[OMP_INNER_FOR_END21:%.*]] -// CHECK3: omp.inner.for.body10: +// CHECK3-NEXT: [[CMP8:%.*]] = icmp sle i32 [[TMP20]], [[TMP21]] +// CHECK3-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY9:%.*]], label [[OMP_INNER_FOR_END20:%.*]] +// CHECK3: omp.inner.for.body9: // CHECK3-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK3-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64 // CHECK3-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK3-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i64 // CHECK3-NEXT: [[TMP26:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK3-NEXT: [[TOBOOL11:%.*]] = trunc i8 [[TMP26]] to i1 -// CHECK3-NEXT: [[FROMBOOL13:%.*]] = zext i1 [[TOBOOL11]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL13]], ptr [[DOTCAPTURE_EXPR__CASTED12]], align 1 -// CHECK3-NEXT: [[TMP27:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED12]], align 8 +// CHECK3-NEXT: [[LOADEDV10:%.*]] = trunc i8 [[TMP26]] to i1 +// CHECK3-NEXT: [[STOREDV12:%.*]] = zext i1 [[LOADEDV10]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV12]], ptr [[DOTCAPTURE_EXPR__CASTED11]], align 1 +// CHECK3-NEXT: [[TMP27:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED11]], align 8 // CHECK3-NEXT: [[TMP28:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK3-NEXT: [[TOBOOL14:%.*]] = trunc i8 [[TMP28]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL14]], label [[OMP_IF_THEN15:%.*]], label [[OMP_IF_ELSE16:%.*]] -// CHECK3: omp_if.then15: +// CHECK3-NEXT: [[LOADEDV13:%.*]] = trunc i8 [[TMP28]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV13]], label [[OMP_IF_THEN14:%.*]], label [[OMP_IF_ELSE15:%.*]] +// CHECK3: omp_if.then14: // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l99.omp_outlined.omp_outlined.1, i64 [[TMP23]], i64 [[TMP25]], i64 [[TMP27]]) -// CHECK3-NEXT: br label [[OMP_IF_END18:%.*]] -// CHECK3: omp_if.else16: +// CHECK3-NEXT: br label [[OMP_IF_END17:%.*]] +// CHECK3: omp_if.else15: // CHECK3-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP3]]) // CHECK3-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR17]], align 4 -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l99.omp_outlined.omp_outlined.1(ptr [[TMP29]], ptr [[DOTBOUND_ZERO_ADDR17]], i64 [[TMP23]], i64 [[TMP25]], i64 [[TMP27]]) #[[ATTR2]] +// CHECK3-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR16]], align 4 +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l99.omp_outlined.omp_outlined.1(ptr [[TMP29]], ptr [[DOTBOUND_ZERO_ADDR16]], i64 [[TMP23]], i64 [[TMP25]], i64 [[TMP27]]) #[[ATTR2]] // CHECK3-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP3]]) -// CHECK3-NEXT: br label [[OMP_IF_END18]] -// CHECK3: omp_if.end18: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC19:%.*]] -// CHECK3: omp.inner.for.inc19: +// CHECK3-NEXT: br label [[OMP_IF_END17]] +// CHECK3: omp_if.end17: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC18:%.*]] +// CHECK3: omp.inner.for.inc18: // CHECK3-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK3-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD20:%.*]] = add nsw i32 [[TMP30]], [[TMP31]] -// CHECK3-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND8]], !llvm.loop [[LOOP38:![0-9]+]] -// CHECK3: omp.inner.for.end21: -// CHECK3-NEXT: br label [[OMP_IF_END22]] -// CHECK3: omp_if.end22: +// CHECK3-NEXT: [[ADD19:%.*]] = add nsw i32 [[TMP30]], [[TMP31]] +// CHECK3-NEXT: store i32 [[ADD19]], ptr [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND7]], !llvm.loop [[LOOP38:![0-9]+]] +// CHECK3: omp.inner.for.end20: +// CHECK3-NEXT: br label [[OMP_IF_END21]] +// CHECK3: omp_if.end21: // CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] // CHECK3: omp.loop.exit: // CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP3]]) @@ -2725,8 +2725,8 @@ int main() { // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK3-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP2]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK3: omp_if.then: // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 @@ -2852,8 +2852,8 @@ int main() { // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK3-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP2]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK3: omp_if.then: // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 @@ -3969,60 +3969,60 @@ int main() { // CHECK7-NEXT: store i32 100, ptr [[I6]], align 4 // CHECK7-NEXT: [[TMP10:%.*]] = load i32, ptr @Arg, align 4 // CHECK7-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP10]], 0 -// CHECK7-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK7-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK7-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK7-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK7-NEXT: store i32 0, ptr [[DOTOMP_LB17]], align 4 // CHECK7-NEXT: store i32 99, ptr [[DOTOMP_UB18]], align 4 // CHECK7-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB17]], align 4 // CHECK7-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV19]], align 4 // CHECK7-NEXT: [[TMP12:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK7-NEXT: [[TOBOOL21:%.*]] = trunc i8 [[TMP12]] to i1 -// CHECK7-NEXT: br i1 [[TOBOOL21]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK7-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP12]] to i1 +// CHECK7-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK7: omp_if.then: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND22:%.*]] -// CHECK7: omp.inner.for.cond22: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND21:%.*]] +// CHECK7: omp.inner.for.cond21: // CHECK7-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14:![0-9]+]] // CHECK7-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB18]], align 4, !llvm.access.group [[ACC_GRP14]] -// CHECK7-NEXT: [[CMP23:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] -// CHECK7-NEXT: br i1 [[CMP23]], label [[OMP_INNER_FOR_BODY24:%.*]], label [[OMP_INNER_FOR_END30:%.*]] -// CHECK7: omp.inner.for.body24: +// CHECK7-NEXT: [[CMP22:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] +// CHECK7-NEXT: br i1 [[CMP22]], label [[OMP_INNER_FOR_BODY23:%.*]], label [[OMP_INNER_FOR_END29:%.*]] +// CHECK7: omp.inner.for.body23: // CHECK7-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14]] -// CHECK7-NEXT: [[MUL25:%.*]] = mul nsw i32 [[TMP15]], 1 -// CHECK7-NEXT: [[ADD26:%.*]] = add nsw i32 0, [[MUL25]] -// CHECK7-NEXT: store i32 [[ADD26]], ptr [[I20]], align 4, !llvm.access.group [[ACC_GRP14]] +// CHECK7-NEXT: [[MUL24:%.*]] = mul nsw i32 [[TMP15]], 1 +// CHECK7-NEXT: [[ADD25:%.*]] = add nsw i32 0, [[MUL24]] +// CHECK7-NEXT: store i32 [[ADD25]], ptr [[I20]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK7-NEXT: call void @_Z3fn6v(), !llvm.access.group [[ACC_GRP14]] -// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE27:%.*]] -// CHECK7: omp.body.continue27: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC28:%.*]] -// CHECK7: omp.inner.for.inc28: +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE26:%.*]] +// CHECK7: omp.body.continue26: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC27:%.*]] +// CHECK7: omp.inner.for.inc27: // CHECK7-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14]] -// CHECK7-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP16]], 1 -// CHECK7-NEXT: store i32 [[ADD29]], ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14]] -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND22]], !llvm.loop [[LOOP15:![0-9]+]] -// CHECK7: omp.inner.for.end30: +// CHECK7-NEXT: [[ADD28:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK7-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14]] +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND21]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK7: omp.inner.for.end29: // CHECK7-NEXT: br label [[OMP_IF_END:%.*]] // CHECK7: omp_if.else: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND31:%.*]] -// CHECK7: omp.inner.for.cond31: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND30:%.*]] +// CHECK7: omp.inner.for.cond30: // CHECK7-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4 // CHECK7-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_UB18]], align 4 -// CHECK7-NEXT: [[CMP32:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]] -// CHECK7-NEXT: br i1 [[CMP32]], label [[OMP_INNER_FOR_BODY33:%.*]], label [[OMP_INNER_FOR_END39:%.*]] -// CHECK7: omp.inner.for.body33: +// CHECK7-NEXT: [[CMP31:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]] +// CHECK7-NEXT: br i1 [[CMP31]], label [[OMP_INNER_FOR_BODY32:%.*]], label [[OMP_INNER_FOR_END38:%.*]] +// CHECK7: omp.inner.for.body32: // CHECK7-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4 -// CHECK7-NEXT: [[MUL34:%.*]] = mul nsw i32 [[TMP19]], 1 -// CHECK7-NEXT: [[ADD35:%.*]] = add nsw i32 0, [[MUL34]] -// CHECK7-NEXT: store i32 [[ADD35]], ptr [[I20]], align 4 +// CHECK7-NEXT: [[MUL33:%.*]] = mul nsw i32 [[TMP19]], 1 +// CHECK7-NEXT: [[ADD34:%.*]] = add nsw i32 0, [[MUL33]] +// CHECK7-NEXT: store i32 [[ADD34]], ptr [[I20]], align 4 // CHECK7-NEXT: call void @_Z3fn6v() -// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE36:%.*]] -// CHECK7: omp.body.continue36: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC37:%.*]] -// CHECK7: omp.inner.for.inc37: +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE35:%.*]] +// CHECK7: omp.body.continue35: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC36:%.*]] +// CHECK7: omp.inner.for.inc36: // CHECK7-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4 -// CHECK7-NEXT: [[ADD38:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK7-NEXT: store i32 [[ADD38]], ptr [[DOTOMP_IV19]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND31]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK7: omp.inner.for.end39: +// CHECK7-NEXT: [[ADD37:%.*]] = add nsw i32 [[TMP20]], 1 +// CHECK7-NEXT: store i32 [[ADD37]], ptr [[DOTOMP_IV19]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK7: omp.inner.for.end38: // CHECK7-NEXT: br label [[OMP_IF_END]] // CHECK7: omp_if.end: // CHECK7-NEXT: store i32 100, ptr [[I20]], align 4 @@ -6582,16 +6582,16 @@ int main() { // CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 // CHECK11-NEXT: [[DOTBOUND_ZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR__CASTED12:%.*]] = alloca i64, align 8 -// CHECK11-NEXT: [[DOTBOUND_ZERO_ADDR17:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR__CASTED11:%.*]] = alloca i64, align 8 +// CHECK11-NEXT: [[DOTBOUND_ZERO_ADDR16:%.*]] = alloca i32, align 4 // CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK11-NEXT: store ptr [[ARG]], ptr [[ARG_ADDR]], align 8 // CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 8 // CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK11-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0 -// CHECK11-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK11-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK11-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK11-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK11-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK11-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK11-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -6613,29 +6613,29 @@ int main() { // CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK11-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4 // CHECK11-NEXT: [[TMP7:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK11-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP7]] to i1 -// CHECK11-NEXT: br i1 [[TOBOOL1]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE7:%.*]] +// CHECK11-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1 +// CHECK11-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE6:%.*]] // CHECK11: omp_if.then: // CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK11: omp.inner.for.cond: // CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP35:![0-9]+]] // CHECK11-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP35]] -// CHECK11-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]] -// CHECK11-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]] +// CHECK11-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK11: omp.inner.for.body: // CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP35]] // CHECK11-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 // CHECK11-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP35]] // CHECK11-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64 // CHECK11-NEXT: [[TMP14:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1, !llvm.access.group [[ACC_GRP35]] -// CHECK11-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP14]] to i1 -// CHECK11-NEXT: [[FROMBOOL4:%.*]] = zext i1 [[TOBOOL3]] to i8 -// CHECK11-NEXT: store i8 [[FROMBOOL4]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1, !llvm.access.group [[ACC_GRP35]] +// CHECK11-NEXT: [[LOADEDV2:%.*]] = trunc i8 [[TMP14]] to i1 +// CHECK11-NEXT: [[STOREDV3:%.*]] = zext i1 [[LOADEDV2]] to i8 +// CHECK11-NEXT: store i8 [[STOREDV3]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1, !llvm.access.group [[ACC_GRP35]] // CHECK11-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8, !llvm.access.group [[ACC_GRP35]] // CHECK11-NEXT: [[TMP16:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1, !llvm.access.group [[ACC_GRP35]] -// CHECK11-NEXT: [[TOBOOL5:%.*]] = trunc i8 [[TMP16]] to i1 -// CHECK11-NEXT: br i1 [[TOBOOL5]], label [[OMP_IF_THEN6:%.*]], label [[OMP_IF_ELSE:%.*]] -// CHECK11: omp_if.then6: +// CHECK11-NEXT: [[LOADEDV4:%.*]] = trunc i8 [[TMP16]] to i1 +// CHECK11-NEXT: br i1 [[LOADEDV4]], label [[OMP_IF_THEN5:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK11: omp_if.then5: // CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l99.omp_outlined.omp_outlined, i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]]), !llvm.access.group [[ACC_GRP35]] // CHECK11-NEXT: br label [[OMP_IF_END:%.*]] // CHECK11: omp_if.else: @@ -6654,48 +6654,48 @@ int main() { // CHECK11-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP35]] // CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]] // CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_IF_END22:%.*]] -// CHECK11: omp_if.else7: -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND8:%.*]] -// CHECK11: omp.inner.for.cond8: +// CHECK11-NEXT: br label [[OMP_IF_END21:%.*]] +// CHECK11: omp_if.else6: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND7:%.*]] +// CHECK11: omp.inner.for.cond7: // CHECK11-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK11-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[CMP9:%.*]] = icmp sle i32 [[TMP20]], [[TMP21]] -// CHECK11-NEXT: br i1 [[CMP9]], label [[OMP_INNER_FOR_BODY10:%.*]], label [[OMP_INNER_FOR_END21:%.*]] -// CHECK11: omp.inner.for.body10: +// CHECK11-NEXT: [[CMP8:%.*]] = icmp sle i32 [[TMP20]], [[TMP21]] +// CHECK11-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY9:%.*]], label [[OMP_INNER_FOR_END20:%.*]] +// CHECK11: omp.inner.for.body9: // CHECK11-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK11-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64 // CHECK11-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK11-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i64 // CHECK11-NEXT: [[TMP26:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK11-NEXT: [[TOBOOL11:%.*]] = trunc i8 [[TMP26]] to i1 -// CHECK11-NEXT: [[FROMBOOL13:%.*]] = zext i1 [[TOBOOL11]] to i8 -// CHECK11-NEXT: store i8 [[FROMBOOL13]], ptr [[DOTCAPTURE_EXPR__CASTED12]], align 1 -// CHECK11-NEXT: [[TMP27:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED12]], align 8 +// CHECK11-NEXT: [[LOADEDV10:%.*]] = trunc i8 [[TMP26]] to i1 +// CHECK11-NEXT: [[STOREDV12:%.*]] = zext i1 [[LOADEDV10]] to i8 +// CHECK11-NEXT: store i8 [[STOREDV12]], ptr [[DOTCAPTURE_EXPR__CASTED11]], align 1 +// CHECK11-NEXT: [[TMP27:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED11]], align 8 // CHECK11-NEXT: [[TMP28:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK11-NEXT: [[TOBOOL14:%.*]] = trunc i8 [[TMP28]] to i1 -// CHECK11-NEXT: br i1 [[TOBOOL14]], label [[OMP_IF_THEN15:%.*]], label [[OMP_IF_ELSE16:%.*]] -// CHECK11: omp_if.then15: +// CHECK11-NEXT: [[LOADEDV13:%.*]] = trunc i8 [[TMP28]] to i1 +// CHECK11-NEXT: br i1 [[LOADEDV13]], label [[OMP_IF_THEN14:%.*]], label [[OMP_IF_ELSE15:%.*]] +// CHECK11: omp_if.then14: // CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l99.omp_outlined.omp_outlined.1, i64 [[TMP23]], i64 [[TMP25]], i64 [[TMP27]]) -// CHECK11-NEXT: br label [[OMP_IF_END18:%.*]] -// CHECK11: omp_if.else16: +// CHECK11-NEXT: br label [[OMP_IF_END17:%.*]] +// CHECK11: omp_if.else15: // CHECK11-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP3]]) // CHECK11-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK11-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR17]], align 4 -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l99.omp_outlined.omp_outlined.1(ptr [[TMP29]], ptr [[DOTBOUND_ZERO_ADDR17]], i64 [[TMP23]], i64 [[TMP25]], i64 [[TMP27]]) #[[ATTR2]] +// CHECK11-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR16]], align 4 +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l99.omp_outlined.omp_outlined.1(ptr [[TMP29]], ptr [[DOTBOUND_ZERO_ADDR16]], i64 [[TMP23]], i64 [[TMP25]], i64 [[TMP27]]) #[[ATTR2]] // CHECK11-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP3]]) -// CHECK11-NEXT: br label [[OMP_IF_END18]] -// CHECK11: omp_if.end18: -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC19:%.*]] -// CHECK11: omp.inner.for.inc19: +// CHECK11-NEXT: br label [[OMP_IF_END17]] +// CHECK11: omp_if.end17: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC18:%.*]] +// CHECK11: omp.inner.for.inc18: // CHECK11-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK11-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD20:%.*]] = add nsw i32 [[TMP30]], [[TMP31]] -// CHECK11-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND8]], !llvm.loop [[LOOP38:![0-9]+]] -// CHECK11: omp.inner.for.end21: -// CHECK11-NEXT: br label [[OMP_IF_END22]] -// CHECK11: omp_if.end22: +// CHECK11-NEXT: [[ADD19:%.*]] = add nsw i32 [[TMP30]], [[TMP31]] +// CHECK11-NEXT: store i32 [[ADD19]], ptr [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND7]], !llvm.loop [[LOOP38:![0-9]+]] +// CHECK11: omp.inner.for.end20: +// CHECK11-NEXT: br label [[OMP_IF_END21]] +// CHECK11: omp_if.end21: // CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] // CHECK11: omp.loop.exit: // CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP3]]) @@ -6740,8 +6740,8 @@ int main() { // CHECK11-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK11-NEXT: [[TMP2:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK11-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1 -// CHECK11-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK11-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP2]] to i1 +// CHECK11-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK11: omp_if.then: // CHECK11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 @@ -6867,8 +6867,8 @@ int main() { // CHECK11-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK11-NEXT: [[TMP2:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK11-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1 -// CHECK11-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK11-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP2]] to i1 +// CHECK11-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK11: omp_if.then: // CHECK11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 @@ -7984,60 +7984,60 @@ int main() { // CHECK15-NEXT: store i32 100, ptr [[I6]], align 4 // CHECK15-NEXT: [[TMP10:%.*]] = load i32, ptr @Arg, align 4 // CHECK15-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP10]], 0 -// CHECK15-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK15-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK15-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK15-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK15-NEXT: store i32 0, ptr [[DOTOMP_LB17]], align 4 // CHECK15-NEXT: store i32 99, ptr [[DOTOMP_UB18]], align 4 // CHECK15-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB17]], align 4 // CHECK15-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV19]], align 4 // CHECK15-NEXT: [[TMP12:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK15-NEXT: [[TOBOOL21:%.*]] = trunc i8 [[TMP12]] to i1 -// CHECK15-NEXT: br i1 [[TOBOOL21]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK15-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP12]] to i1 +// CHECK15-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK15: omp_if.then: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND22:%.*]] -// CHECK15: omp.inner.for.cond22: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND21:%.*]] +// CHECK15: omp.inner.for.cond21: // CHECK15-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14:![0-9]+]] // CHECK15-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB18]], align 4, !llvm.access.group [[ACC_GRP14]] -// CHECK15-NEXT: [[CMP23:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] -// CHECK15-NEXT: br i1 [[CMP23]], label [[OMP_INNER_FOR_BODY24:%.*]], label [[OMP_INNER_FOR_END30:%.*]] -// CHECK15: omp.inner.for.body24: +// CHECK15-NEXT: [[CMP22:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] +// CHECK15-NEXT: br i1 [[CMP22]], label [[OMP_INNER_FOR_BODY23:%.*]], label [[OMP_INNER_FOR_END29:%.*]] +// CHECK15: omp.inner.for.body23: // CHECK15-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14]] -// CHECK15-NEXT: [[MUL25:%.*]] = mul nsw i32 [[TMP15]], 1 -// CHECK15-NEXT: [[ADD26:%.*]] = add nsw i32 0, [[MUL25]] -// CHECK15-NEXT: store i32 [[ADD26]], ptr [[I20]], align 4, !llvm.access.group [[ACC_GRP14]] +// CHECK15-NEXT: [[MUL24:%.*]] = mul nsw i32 [[TMP15]], 1 +// CHECK15-NEXT: [[ADD25:%.*]] = add nsw i32 0, [[MUL24]] +// CHECK15-NEXT: store i32 [[ADD25]], ptr [[I20]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK15-NEXT: call void @_Z3fn6v(), !llvm.access.group [[ACC_GRP14]] -// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE27:%.*]] -// CHECK15: omp.body.continue27: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC28:%.*]] -// CHECK15: omp.inner.for.inc28: +// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE26:%.*]] +// CHECK15: omp.body.continue26: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC27:%.*]] +// CHECK15: omp.inner.for.inc27: // CHECK15-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14]] -// CHECK15-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP16]], 1 -// CHECK15-NEXT: store i32 [[ADD29]], ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14]] -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND22]], !llvm.loop [[LOOP15:![0-9]+]] -// CHECK15: omp.inner.for.end30: +// CHECK15-NEXT: [[ADD28:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK15-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14]] +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND21]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK15: omp.inner.for.end29: // CHECK15-NEXT: br label [[OMP_IF_END:%.*]] // CHECK15: omp_if.else: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND31:%.*]] -// CHECK15: omp.inner.for.cond31: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND30:%.*]] +// CHECK15: omp.inner.for.cond30: // CHECK15-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4 // CHECK15-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_UB18]], align 4 -// CHECK15-NEXT: [[CMP32:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]] -// CHECK15-NEXT: br i1 [[CMP32]], label [[OMP_INNER_FOR_BODY33:%.*]], label [[OMP_INNER_FOR_END39:%.*]] -// CHECK15: omp.inner.for.body33: +// CHECK15-NEXT: [[CMP31:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]] +// CHECK15-NEXT: br i1 [[CMP31]], label [[OMP_INNER_FOR_BODY32:%.*]], label [[OMP_INNER_FOR_END38:%.*]] +// CHECK15: omp.inner.for.body32: // CHECK15-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4 -// CHECK15-NEXT: [[MUL34:%.*]] = mul nsw i32 [[TMP19]], 1 -// CHECK15-NEXT: [[ADD35:%.*]] = add nsw i32 0, [[MUL34]] -// CHECK15-NEXT: store i32 [[ADD35]], ptr [[I20]], align 4 +// CHECK15-NEXT: [[MUL33:%.*]] = mul nsw i32 [[TMP19]], 1 +// CHECK15-NEXT: [[ADD34:%.*]] = add nsw i32 0, [[MUL33]] +// CHECK15-NEXT: store i32 [[ADD34]], ptr [[I20]], align 4 // CHECK15-NEXT: call void @_Z3fn6v() -// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE36:%.*]] -// CHECK15: omp.body.continue36: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC37:%.*]] -// CHECK15: omp.inner.for.inc37: +// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE35:%.*]] +// CHECK15: omp.body.continue35: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC36:%.*]] +// CHECK15: omp.inner.for.inc36: // CHECK15-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4 -// CHECK15-NEXT: [[ADD38:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK15-NEXT: store i32 [[ADD38]], ptr [[DOTOMP_IV19]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND31]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK15: omp.inner.for.end39: +// CHECK15-NEXT: [[ADD37:%.*]] = add nsw i32 [[TMP20]], 1 +// CHECK15-NEXT: store i32 [[ADD37]], ptr [[DOTOMP_IV19]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK15: omp.inner.for.end38: // CHECK15-NEXT: br label [[OMP_IF_END]] // CHECK15: omp_if.end: // CHECK15-NEXT: store i32 100, ptr [[I20]], align 4 diff --git a/clang/test/OpenMP/irbuilder_for_iterator.cpp b/clang/test/OpenMP/irbuilder_for_iterator.cpp index b88416b36c4fa6..99469d62a9fc13 100644 --- a/clang/test/OpenMP/irbuilder_for_iterator.cpp +++ b/clang/test/OpenMP/irbuilder_for_iterator.cpp @@ -48,8 +48,7 @@ extern "C" void workshareloop_iterator(float *a, float *b, float *c) { // CHECK-NEXT: call void @_ZN10MyIteratorC1Ej(ptr noundef nonnull align 1 dereferenceable(1) [[IT]], i32 noundef 7) // CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 0 // CHECK-NEXT: store ptr [[IT]], ptr [[TMP0]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], ptr [[AGG_CAPTURED1]], i32 0, i32 0 -// CHECK-NEXT: call void @_ZN10MyIteratorC1ERKS_(ptr noundef nonnull align 1 dereferenceable(1) [[TMP1]], ptr noundef nonnull align 1 dereferenceable(1) [[IT]]) +// CHECK-NEXT: call void @_ZN10MyIteratorC1ERKS_(ptr noundef nonnull align 1 dereferenceable(1) [[AGG_CAPTURED1]], ptr noundef nonnull align 1 dereferenceable(1) [[IT]]) // CHECK-NEXT: call void @__captured_stmt(ptr [[DOTCOUNT_ADDR]], ptr [[AGG_CAPTURED]]) // CHECK-NEXT: [[DOTCOUNT:%.*]] = load i64, ptr [[DOTCOUNT_ADDR]], align 8 // CHECK-NEXT: br label [[OMP_LOOP_PREHEADER:%.*]] @@ -155,11 +154,10 @@ extern "C" void workshareloop_iterator(float *a, float *b, float *c) { // CHECK-NEXT: store i64 [[LOGICAL]], ptr [[LOGICAL_ADDR]], align 8 // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[LOGICAL_ADDR]], align 8 -// CHECK-NEXT: [[MUL:%.*]] = mul i64 1, [[TMP2]] +// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[LOGICAL_ADDR]], align 8 +// CHECK-NEXT: [[MUL:%.*]] = mul i64 1, [[TMP1]] // CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[MUL]] to i32 -// CHECK-NEXT: call void @_ZNK10MyIteratorplEj(ptr dead_on_unwind writable sret([[STRUCT_MYITERATOR]]) align 1 [[REF_TMP]], ptr noundef nonnull align 1 dereferenceable(1) [[TMP1]], i32 noundef [[CONV]]) +// CHECK-NEXT: call void @_ZNK10MyIteratorplEj(ptr dead_on_unwind writable sret([[STRUCT_MYITERATOR]]) align 1 [[REF_TMP]], ptr noundef nonnull align 1 dereferenceable(1) [[TMP0]], i32 noundef [[CONV]]) // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 // CHECK-NEXT: [[CALL:%.*]] = call noundef nonnull align 1 dereferenceable(1) ptr @_ZN10MyIteratoraSERKS_(ptr noundef nonnull align 1 dereferenceable(1) [[TMP3]], ptr noundef nonnull align 1 dereferenceable(1) [[REF_TMP]]) // CHECK-NEXT: ret void diff --git a/clang/test/OpenMP/irbuilder_for_rangefor.cpp b/clang/test/OpenMP/irbuilder_for_rangefor.cpp index 6bf91bfda138af..6bf44e2ee41536 100644 --- a/clang/test/OpenMP/irbuilder_for_rangefor.cpp +++ b/clang/test/OpenMP/irbuilder_for_rangefor.cpp @@ -66,8 +66,7 @@ extern "C" void workshareloop_rangefor(float *a, float *b, float *c) { // CHECK-NEXT: store ptr [[__BEGIN2]], ptr [[TMP2]], align 8 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 1 // CHECK-NEXT: store ptr [[__END2]], ptr [[TMP3]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], ptr [[AGG_CAPTURED1]], i32 0, i32 0 -// CHECK-NEXT: call void @_ZN10MyIteratorC1ERKS_(ptr noundef nonnull align 1 dereferenceable(1) [[TMP4]], ptr noundef nonnull align 1 dereferenceable(1) [[__BEGIN2]]) +// CHECK-NEXT: call void @_ZN10MyIteratorC1ERKS_(ptr noundef nonnull align 1 dereferenceable(1) [[AGG_CAPTURED1]], ptr noundef nonnull align 1 dereferenceable(1) [[__BEGIN2]]) // CHECK-NEXT: call void @__captured_stmt(ptr [[DOTCOUNT_ADDR]], ptr [[AGG_CAPTURED]]) // CHECK-NEXT: [[DOTCOUNT:%.*]] = load i64, ptr [[DOTCOUNT_ADDR]], align 8 // CHECK-NEXT: br label [[OMP_LOOP_PREHEADER:%.*]] @@ -173,13 +172,12 @@ extern "C" void workshareloop_rangefor(float *a, float *b, float *c) { // CHECK-NEXT: store i64 [[LOGICAL]], ptr [[LOGICAL_ADDR]], align 8 // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[LOGICAL_ADDR]], align 8 -// CHECK-NEXT: [[MUL:%.*]] = mul i64 1, [[TMP2]] +// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[LOGICAL_ADDR]], align 8 +// CHECK-NEXT: [[MUL:%.*]] = mul i64 1, [[TMP1]] // CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[MUL]] to i32 -// CHECK-NEXT: call void @_ZNK10MyIteratorplEj(ptr dead_on_unwind writable sret([[STRUCT_MYITERATOR]]) align 1 [[REF_TMP]], ptr noundef nonnull align 1 dereferenceable(1) [[TMP1]], i32 noundef [[CONV]]) +// CHECK-NEXT: call void @_ZNK10MyIteratorplEj(ptr dead_on_unwind writable sret([[STRUCT_MYITERATOR]]) align 1 [[REF_TMP]], ptr noundef nonnull align 1 dereferenceable(1) [[TMP0]], i32 noundef [[CONV]]) // CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZNK10MyIteratordeEv(ptr noundef nonnull align 1 dereferenceable(1) [[REF_TMP]]) -// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 -// CHECK-NEXT: store i32 [[CALL]], ptr [[TMP3]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: store i32 [[CALL]], ptr [[TMP2]], align 4 // CHECK-NEXT: ret void // diff --git a/clang/test/OpenMP/parallel_master_taskloop_simd_codegen.cpp b/clang/test/OpenMP/parallel_master_taskloop_simd_codegen.cpp index 0f43f0ac71704a..6387946fcf5448 100644 --- a/clang/test/OpenMP/parallel_master_taskloop_simd_codegen.cpp +++ b/clang/test/OpenMP/parallel_master_taskloop_simd_codegen.cpp @@ -66,8 +66,8 @@ struct S { // CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i8, align 1 // CHECK1-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED6:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED8:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED5:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED7:%.*]] = alloca i64, align 8 // CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTBOUND_ZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) @@ -88,21 +88,21 @@ struct S { // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @main.omp_outlined.1, i64 [[TMP6]]) // CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP7]], 0 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_3]], align 1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_3]], align 1 // CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP8]], ptr [[DOTCAPTURE_EXPR_4]], align 4 // CHECK1-NEXT: [[TMP9:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_3]], align 1 -// CHECK1-NEXT: [[TOBOOL5:%.*]] = trunc i8 [[TMP9]] to i1 -// CHECK1-NEXT: [[FROMBOOL7:%.*]] = zext i1 [[TOBOOL5]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL7]], ptr [[DOTCAPTURE_EXPR__CASTED6]], align 1 -// CHECK1-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED6]], align 8 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP9]] to i1 +// CHECK1-NEXT: [[STOREDV6:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV6]], ptr [[DOTCAPTURE_EXPR__CASTED5]], align 1 +// CHECK1-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED5]], align 8 // CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4 -// CHECK1-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4 -// CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED8]], align 8 +// CHECK1-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR__CASTED7]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED7]], align 8 // CHECK1-NEXT: [[TMP13:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_3]], align 1 -// CHECK1-NEXT: [[TOBOOL9:%.*]] = trunc i8 [[TMP13]] to i1 -// CHECK1-NEXT: br i1 [[TOBOOL9]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK1-NEXT: [[LOADEDV8:%.*]] = trunc i8 [[TMP13]] to i1 +// CHECK1-NEXT: br i1 [[LOADEDV8]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK1: omp_if.then: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 5, ptr @main.omp_outlined.4, ptr [[I]], ptr [[ARGC_ADDR]], ptr [[ARGV_ADDR]], i64 [[TMP10]], i64 [[TMP12]]) // CHECK1-NEXT: br label [[OMP_IF_END:%.*]] @@ -199,34 +199,34 @@ struct S { // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) -// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14 -// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !14 -// CHECK1-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 -// CHECK1-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 -// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !14 -// CHECK1-NEXT: store i64 [[TMP9]], ptr [[DOTLB__ADDR_I]], align 8, !noalias !14 -// CHECK1-NEXT: store i64 [[TMP11]], ptr [[DOTUB__ADDR_I]], align 8, !noalias !14 -// CHECK1-NEXT: store i64 [[TMP13]], ptr [[DOTST__ADDR_I]], align 8, !noalias !14 -// CHECK1-NEXT: store i32 [[TMP15]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias !14 -// CHECK1-NEXT: store ptr [[TMP17]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !14 -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14 -// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14 -// CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias !14 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META14:![0-9]+]] +// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META14]] +// CHECK1-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META14]] +// CHECK1-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META14]] +// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META14]] +// CHECK1-NEXT: store i64 [[TMP9]], ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META14]] +// CHECK1-NEXT: store i64 [[TMP11]], ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META14]] +// CHECK1-NEXT: store i64 [[TMP13]], ptr [[DOTST__ADDR_I]], align 8, !noalias [[META14]] +// CHECK1-NEXT: store i32 [[TMP15]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias [[META14]] +// CHECK1-NEXT: store ptr [[TMP17]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias [[META14]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META14]] +// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META14]] +// CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META14]] // CHECK1-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP19]] to i32 -// CHECK1-NEXT: store i32 [[CONV_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias !14 +// CHECK1-NEXT: store i32 [[CONV_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias [[META14]] // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] // CHECK1: omp.inner.for.cond.i: -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !14 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META14]] // CHECK1-NEXT: [[CONV1_I:%.*]] = sext i32 [[TMP20]] to i64 -// CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias !14 +// CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META14]] // CHECK1-NEXT: [[CMP_I:%.*]] = icmp ule i64 [[CONV1_I]], [[TMP21]] // CHECK1-NEXT: br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK1: omp.inner.for.body.i: -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !14 -// CHECK1-NEXT: store i32 [[TMP22]], ptr [[I_I]], align 4, !noalias !14 -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !14 +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META14]] +// CHECK1-NEXT: store i32 [[TMP22]], ptr [[I_I]], align 4, !noalias [[META14]] +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META14]] // CHECK1-NEXT: [[ADD2_I:%.*]] = add nsw i32 [[TMP23]], 1 -// CHECK1-NEXT: store i32 [[ADD2_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias !14 +// CHECK1-NEXT: store i32 [[ADD2_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias [[META14]] // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP15:![0-9]+]] // CHECK1: .omp_outlined..exit: // CHECK1-NEXT: ret i32 0 @@ -310,34 +310,34 @@ struct S { // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META25:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META27:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META29:![0-9]+]]) -// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !31 -// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !31 -// CHECK1-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !31 -// CHECK1-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !31 -// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !31 -// CHECK1-NEXT: store i64 [[TMP9]], ptr [[DOTLB__ADDR_I]], align 8, !noalias !31 -// CHECK1-NEXT: store i64 [[TMP11]], ptr [[DOTUB__ADDR_I]], align 8, !noalias !31 -// CHECK1-NEXT: store i64 [[TMP13]], ptr [[DOTST__ADDR_I]], align 8, !noalias !31 -// CHECK1-NEXT: store i32 [[TMP15]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias !31 -// CHECK1-NEXT: store ptr [[TMP17]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !31 -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !31 -// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !31 -// CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias !31 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META31:![0-9]+]] +// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META31]] +// CHECK1-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META31]] +// CHECK1-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META31]] +// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META31]] +// CHECK1-NEXT: store i64 [[TMP9]], ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META31]] +// CHECK1-NEXT: store i64 [[TMP11]], ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META31]] +// CHECK1-NEXT: store i64 [[TMP13]], ptr [[DOTST__ADDR_I]], align 8, !noalias [[META31]] +// CHECK1-NEXT: store i32 [[TMP15]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias [[META31]] +// CHECK1-NEXT: store ptr [[TMP17]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias [[META31]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META31]] +// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META31]] +// CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META31]] // CHECK1-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP19]] to i32 -// CHECK1-NEXT: store i32 [[CONV_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias !31 +// CHECK1-NEXT: store i32 [[CONV_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias [[META31]] // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] // CHECK1: omp.inner.for.cond.i: -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !31, !llvm.access.group [[ACC_GRP32:![0-9]+]] +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META31]], !llvm.access.group [[ACC_GRP32:![0-9]+]] // CHECK1-NEXT: [[CONV1_I:%.*]] = sext i32 [[TMP20]] to i64 -// CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias !31, !llvm.access.group [[ACC_GRP32]] +// CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META31]], !llvm.access.group [[ACC_GRP32]] // CHECK1-NEXT: [[CMP_I:%.*]] = icmp ule i64 [[CONV1_I]], [[TMP21]] // CHECK1-NEXT: br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__2_EXIT:%.*]] // CHECK1: omp.inner.for.body.i: -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !31, !llvm.access.group [[ACC_GRP32]] -// CHECK1-NEXT: store i32 [[TMP22]], ptr [[I_I]], align 4, !noalias !31, !llvm.access.group [[ACC_GRP32]] -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !31, !llvm.access.group [[ACC_GRP32]] +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META31]], !llvm.access.group [[ACC_GRP32]] +// CHECK1-NEXT: store i32 [[TMP22]], ptr [[I_I]], align 4, !noalias [[META31]], !llvm.access.group [[ACC_GRP32]] +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META31]], !llvm.access.group [[ACC_GRP32]] // CHECK1-NEXT: [[ADD2_I:%.*]] = add nsw i32 [[TMP23]], 1 -// CHECK1-NEXT: store i32 [[ADD2_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias !31, !llvm.access.group [[ACC_GRP32]] +// CHECK1-NEXT: store i32 [[ADD2_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias [[META31]], !llvm.access.group [[ACC_GRP32]] // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP33:![0-9]+]] // CHECK1: .omp_outlined..2.exit: // CHECK1-NEXT: ret i32 0 @@ -420,8 +420,8 @@ struct S { // CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP24]], ptr align 8 [[AGG_CAPTURED]], i64 24, i1 false) // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_3]], ptr [[TMP21]], i32 0, i32 1 // CHECK1-NEXT: [[TMP26:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP26]] to i1 -// CHECK1-NEXT: [[TMP27:%.*]] = sext i1 [[TOBOOL]] to i32 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP26]] to i1 +// CHECK1-NEXT: [[TMP27:%.*]] = sext i1 [[LOADEDV]] to i32 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP22]], i32 0, i32 5 // CHECK1-NEXT: store i64 0, ptr [[TMP28]], align 8 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP22]], i32 0, i32 6 @@ -505,31 +505,31 @@ struct S { // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META41:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META43:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META45:![0-9]+]]) -// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !47 -// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !47 -// CHECK1-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !47 -// CHECK1-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !47 -// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !47 -// CHECK1-NEXT: store i64 [[TMP10]], ptr [[DOTLB__ADDR_I]], align 8, !noalias !47 -// CHECK1-NEXT: store i64 [[TMP12]], ptr [[DOTUB__ADDR_I]], align 8, !noalias !47 -// CHECK1-NEXT: store i64 [[TMP14]], ptr [[DOTST__ADDR_I]], align 8, !noalias !47 -// CHECK1-NEXT: store i32 [[TMP16]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias !47 -// CHECK1-NEXT: store ptr [[TMP18]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !47 -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !47 -// CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !47 -// CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !47 -// CHECK1-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !47 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META47:![0-9]+]] +// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META47]] +// CHECK1-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META47]] +// CHECK1-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META47]] +// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META47]] +// CHECK1-NEXT: store i64 [[TMP10]], ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META47]] +// CHECK1-NEXT: store i64 [[TMP12]], ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META47]] +// CHECK1-NEXT: store i64 [[TMP14]], ptr [[DOTST__ADDR_I]], align 8, !noalias [[META47]] +// CHECK1-NEXT: store i32 [[TMP16]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias [[META47]] +// CHECK1-NEXT: store ptr [[TMP18]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias [[META47]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META47]] +// CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META47]] +// CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META47]] +// CHECK1-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META47]] // CHECK1-NEXT: call void [[TMP20]](ptr [[TMP21]], ptr [[DOTLASTPRIV_PTR_ADDR_I]]) #[[ATTR2]] // CHECK1-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP19]], align 8 -// CHECK1-NEXT: [[TMP23:%.*]] = load ptr, ptr [[DOTLASTPRIV_PTR_ADDR_I]], align 8, !noalias !47 +// CHECK1-NEXT: [[TMP23:%.*]] = load ptr, ptr [[DOTLASTPRIV_PTR_ADDR_I]], align 8, !noalias [[META47]] // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_ANON_2:%.*]], ptr [[TMP19]], i32 0, i32 1 // CHECK1-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8 // CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 -// CHECK1-NEXT: store i32 [[TMP26]], ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias !47 +// CHECK1-NEXT: store i32 [[TMP26]], ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias [[META47]] // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_ANON_2]], ptr [[TMP19]], i32 0, i32 1 // CHECK1-NEXT: [[TMP28:%.*]] = load ptr, ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 -// CHECK1-NEXT: store i32 [[TMP29]], ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47 +// CHECK1-NEXT: store i32 [[TMP29]], ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]] // CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT_ANON_2]], ptr [[TMP19]], i32 0, i32 2 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[TMP30]], align 8 // CHECK1-NEXT: [[TMP32:%.*]] = load ptr, ptr [[TMP31]], align 8 @@ -546,63 +546,63 @@ struct S { // CHECK1-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds i8, ptr [[TMP36]], i64 [[IDXPROM4_I]] // CHECK1-NEXT: [[TMP40:%.*]] = load i8, ptr [[ARRAYIDX5_I]], align 1 // CHECK1-NEXT: [[CONV_I:%.*]] = sext i8 [[TMP40]] to i32 -// CHECK1-NEXT: store i32 [[CONV_I]], ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47 -// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias !47 +// CHECK1-NEXT: store i32 [[CONV_I]], ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]] +// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias [[META47]] // CHECK1-NEXT: [[CONV7_I:%.*]] = sext i32 [[TMP41]] to i64 -// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47 -// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47 +// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]] +// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]] // CHECK1-NEXT: [[SUB8_I:%.*]] = sub i32 [[TMP42]], [[TMP43]] // CHECK1-NEXT: [[SUB9_I:%.*]] = sub i32 [[SUB8_I]], 1 // CHECK1-NEXT: [[CONV11_I:%.*]] = zext i32 [[SUB8_I]] to i64 // CHECK1-NEXT: [[MUL_I:%.*]] = mul nsw i64 [[CONV7_I]], [[CONV11_I]] // CHECK1-NEXT: [[SUB12_I:%.*]] = sub nsw i64 [[MUL_I]], 1 -// CHECK1-NEXT: store i64 [[SUB12_I]], ptr [[DOTCAPTURE_EXPR_6_I]], align 8, !noalias !47 -// CHECK1-NEXT: store i32 0, ptr [[I_I]], align 4, !noalias !47 -// CHECK1-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47 -// CHECK1-NEXT: store i32 [[TMP44]], ptr [[J_I]], align 4, !noalias !47 -// CHECK1-NEXT: [[TMP45:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias !47 +// CHECK1-NEXT: store i64 [[SUB12_I]], ptr [[DOTCAPTURE_EXPR_6_I]], align 8, !noalias [[META47]] +// CHECK1-NEXT: store i32 0, ptr [[I_I]], align 4, !noalias [[META47]] +// CHECK1-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]] +// CHECK1-NEXT: store i32 [[TMP44]], ptr [[J_I]], align 4, !noalias [[META47]] +// CHECK1-NEXT: [[TMP45:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias [[META47]] // CHECK1-NEXT: [[CMP_I:%.*]] = icmp slt i32 0, [[TMP45]] // CHECK1-NEXT: br i1 [[CMP_I]], label [[LAND_LHS_TRUE_I:%.*]], label [[TASKLOOP_IF_END_I:%.*]] // CHECK1: land.lhs.true.i: -// CHECK1-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47 -// CHECK1-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47 +// CHECK1-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]] +// CHECK1-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]] // CHECK1-NEXT: [[CMP13_I:%.*]] = icmp slt i32 [[TMP46]], [[TMP47]] // CHECK1-NEXT: br i1 [[CMP13_I]], label [[TASKLOOP_IF_THEN_I:%.*]], label [[TASKLOOP_IF_END_I]] // CHECK1: taskloop.if.then.i: -// CHECK1-NEXT: [[TMP48:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias !47 -// CHECK1-NEXT: store i64 [[TMP48]], ptr [[DOTOMP_IV_I]], align 8, !noalias !47 +// CHECK1-NEXT: [[TMP48:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META47]] +// CHECK1-NEXT: store i64 [[TMP48]], ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]] // CHECK1-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT_ANON_2]], ptr [[TMP19]], i32 0, i32 1 // CHECK1-NEXT: [[TMP50:%.*]] = load ptr, ptr [[TMP49]], align 8 // CHECK1-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_ANON_2]], ptr [[TMP19]], i32 0, i32 2 // CHECK1-NEXT: [[TMP52:%.*]] = load ptr, ptr [[TMP51]], align 8 // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] // CHECK1: omp.inner.for.cond.i: -// CHECK1-NEXT: [[TMP53:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48:![0-9]+]] -// CHECK1-NEXT: [[TMP54:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK1-NEXT: [[TMP53:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48:![0-9]+]] +// CHECK1-NEXT: [[TMP54:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK1-NEXT: [[CMP16_I:%.*]] = icmp ule i64 [[TMP53]], [[TMP54]] // CHECK1-NEXT: br i1 [[CMP16_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[OMP_INNER_FOR_END_I:%.*]] // CHECK1: omp.inner.for.body.i: -// CHECK1-NEXT: [[TMP55:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK1-NEXT: [[TMP56:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK1-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK1-NEXT: [[TMP55:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK1-NEXT: [[TMP56:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK1-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK1-NEXT: [[SUB17_I:%.*]] = sub i32 [[TMP56]], [[TMP57]] // CHECK1-NEXT: [[SUB18_I:%.*]] = sub i32 [[SUB17_I]], 1 // CHECK1-NEXT: [[CONV22_I:%.*]] = zext i32 [[SUB17_I]] to i64 // CHECK1-NEXT: [[DIV23_I:%.*]] = sdiv i64 [[TMP55]], [[CONV22_I]] // CHECK1-NEXT: [[CONV26_I:%.*]] = trunc i64 [[DIV23_I]] to i32 -// CHECK1-NEXT: store i32 [[CONV26_I]], ptr [[I14_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK1-NEXT: [[TMP58:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK1-NEXT: store i32 [[CONV26_I]], ptr [[I14_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK1-NEXT: [[TMP58:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK1-NEXT: [[CONV27_I:%.*]] = sext i32 [[TMP58]] to i64 -// CHECK1-NEXT: [[TMP59:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK1-NEXT: [[TMP60:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK1-NEXT: [[TMP61:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK1-NEXT: [[TMP62:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK1-NEXT: [[TMP59:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK1-NEXT: [[TMP60:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK1-NEXT: [[TMP61:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK1-NEXT: [[TMP62:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK1-NEXT: [[SUB28_I:%.*]] = sub i32 [[TMP61]], [[TMP62]] // CHECK1-NEXT: [[SUB29_I:%.*]] = sub i32 [[SUB28_I]], 1 // CHECK1-NEXT: [[CONV33_I:%.*]] = zext i32 [[SUB28_I]] to i64 // CHECK1-NEXT: [[DIV34_I:%.*]] = sdiv i64 [[TMP60]], [[CONV33_I]] -// CHECK1-NEXT: [[TMP63:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK1-NEXT: [[TMP64:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK1-NEXT: [[TMP63:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK1-NEXT: [[TMP64:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK1-NEXT: [[SUB35_I:%.*]] = sub i32 [[TMP63]], [[TMP64]] // CHECK1-NEXT: [[SUB36_I:%.*]] = sub i32 [[SUB35_I]], 1 // CHECK1-NEXT: [[CONV40_I:%.*]] = zext i32 [[SUB35_I]] to i64 @@ -610,15 +610,15 @@ struct S { // CHECK1-NEXT: [[SUB42_I:%.*]] = sub nsw i64 [[TMP59]], [[MUL41_I]] // CHECK1-NEXT: [[ADD44_I:%.*]] = add nsw i64 [[CONV27_I]], [[SUB42_I]] // CHECK1-NEXT: [[CONV45_I:%.*]] = trunc i64 [[ADD44_I]] to i32 -// CHECK1-NEXT: store i32 [[CONV45_I]], ptr [[J15_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK1-NEXT: [[TMP65:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK1-NEXT: store i32 [[CONV45_I]], ptr [[J15_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK1-NEXT: [[TMP65:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK1-NEXT: [[ADD46_I:%.*]] = add nsw i64 [[TMP65]], 1 -// CHECK1-NEXT: store i64 [[ADD46_I]], ptr [[DOTOMP_IV_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK1-NEXT: store i64 [[ADD46_I]], ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP49:![0-9]+]] // CHECK1: omp.inner.for.end.i: // CHECK1-NEXT: br label [[TASKLOOP_IF_END_I]] // CHECK1: taskloop.if.end.i: -// CHECK1-NEXT: [[TMP66:%.*]] = load i32, ptr [[DOTLITER__ADDR_I]], align 4, !noalias !47 +// CHECK1-NEXT: [[TMP66:%.*]] = load i32, ptr [[DOTLITER__ADDR_I]], align 4, !noalias [[META47]] // CHECK1-NEXT: [[TMP67:%.*]] = icmp ne i32 [[TMP66]], 0 // CHECK1-NEXT: br i1 [[TMP67]], label [[DOTOMP_LASTPRIVATE_THEN_I:%.*]], label [[DOTOMP_OUTLINED__5_EXIT:%.*]] // CHECK1: .omp.lastprivate.then.i: @@ -677,12 +677,12 @@ struct S { // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[C_ADDR]], align 4 // CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL2:%.*]] = trunc i8 [[TMP1]] to i1 -// CHECK1-NEXT: [[FROMBOOL3:%.*]] = zext i1 [[TOBOOL2]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL3]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP1]] to i1 +// CHECK1-NEXT: [[STOREDV2:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @_ZN1SC2Ei.omp_outlined, ptr [[THIS1]], ptr [[C_ADDR]], i64 [[TMP2]]) // CHECK1-NEXT: ret void @@ -720,7 +720,7 @@ struct S { // CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP7]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP3]]) // CHECK1-NEXT: [[TMP8:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP8]] to i1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP8]] to i1 // CHECK1-NEXT: store ptr [[TMP]], ptr [[_TMP1]], align 8 // CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP1]], align 4 // CHECK1-NEXT: store i32 [[TMP9]], ptr [[DOTCAPTURE_EXPR_2]], align 4 @@ -729,7 +729,7 @@ struct S { // CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 // CHECK1-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 // CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: [[TMP11:%.*]] = select i1 [[TOBOOL]], i32 2, i32 0 +// CHECK1-NEXT: [[TMP11:%.*]] = select i1 [[LOADEDV]], i32 2, i32 0 // CHECK1-NEXT: [[TMP12:%.*]] = or i32 [[TMP11]], 1 // CHECK1-NEXT: [[TMP13:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP3]], i32 [[TMP12]], i64 80, i64 16, ptr @.omp_task_entry..8) // CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_5:%.*]], ptr [[TMP13]], i32 0, i32 0 @@ -803,54 +803,54 @@ struct S { // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META56:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META58:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META60:![0-9]+]]) -// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !62 -// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !62 -// CHECK1-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !62 -// CHECK1-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !62 -// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !62 -// CHECK1-NEXT: store i64 [[TMP9]], ptr [[DOTLB__ADDR_I]], align 8, !noalias !62 -// CHECK1-NEXT: store i64 [[TMP11]], ptr [[DOTUB__ADDR_I]], align 8, !noalias !62 -// CHECK1-NEXT: store i64 [[TMP13]], ptr [[DOTST__ADDR_I]], align 8, !noalias !62 -// CHECK1-NEXT: store i32 [[TMP15]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias !62 -// CHECK1-NEXT: store ptr [[TMP17]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !62 -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !62 -// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !62 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META62:![0-9]+]] +// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META62]] +// CHECK1-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META62]] +// CHECK1-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META62]] +// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META62]] +// CHECK1-NEXT: store i64 [[TMP9]], ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META62]] +// CHECK1-NEXT: store i64 [[TMP11]], ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META62]] +// CHECK1-NEXT: store i64 [[TMP13]], ptr [[DOTST__ADDR_I]], align 8, !noalias [[META62]] +// CHECK1-NEXT: store i32 [[TMP15]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias [[META62]] +// CHECK1-NEXT: store ptr [[TMP17]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias [[META62]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META62]] +// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META62]] // CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP18]], align 8 -// CHECK1-NEXT: store ptr [[TMP_I]], ptr [[TMP1_I]], align 8, !noalias !62 +// CHECK1-NEXT: store ptr [[TMP_I]], ptr [[TMP1_I]], align 8, !noalias [[META62]] // CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_ANON_4:%.*]], ptr [[TMP18]], i32 0, i32 1 // CHECK1-NEXT: [[TMP21:%.*]] = load ptr, ptr [[TMP20]], align 8 // CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 -// CHECK1-NEXT: store i32 [[TMP22]], ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias !62 -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias !62 +// CHECK1-NEXT: store i32 [[TMP22]], ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias [[META62]] +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias [[META62]] // CHECK1-NEXT: [[SUB3_I:%.*]] = sub nsw i32 [[TMP23]], 1 -// CHECK1-NEXT: store i32 [[SUB3_I]], ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !62 -// CHECK1-NEXT: store ptr [[A_I]], ptr [[TMP4_I]], align 8, !noalias !62 -// CHECK1-NEXT: [[TMP24:%.*]] = load ptr, ptr [[TMP4_I]], align 8, !noalias !62 +// CHECK1-NEXT: store i32 [[SUB3_I]], ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META62]] +// CHECK1-NEXT: store ptr [[A_I]], ptr [[TMP4_I]], align 8, !noalias [[META62]] +// CHECK1-NEXT: [[TMP24:%.*]] = load ptr, ptr [[TMP4_I]], align 8, !noalias [[META62]] // CHECK1-NEXT: store i32 0, ptr [[TMP24]], align 4 -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias !62 +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias [[META62]] // CHECK1-NEXT: [[CMP_I:%.*]] = icmp slt i32 0, [[TMP25]] // CHECK1-NEXT: br i1 [[CMP_I]], label [[TASKLOOP_IF_THEN_I:%.*]], label [[DOTOMP_OUTLINED__7_EXIT:%.*]] // CHECK1: taskloop.if.then.i: -// CHECK1-NEXT: store ptr [[A5_I]], ptr [[TMP6_I]], align 8, !noalias !62 -// CHECK1-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias !62 +// CHECK1-NEXT: store ptr [[A5_I]], ptr [[TMP6_I]], align 8, !noalias [[META62]] +// CHECK1-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META62]] // CHECK1-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP26]] to i32 -// CHECK1-NEXT: store i32 [[CONV_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias !62 +// CHECK1-NEXT: store i32 [[CONV_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias [[META62]] // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_ANON_4]], ptr [[TMP18]], i32 0, i32 1 // CHECK1-NEXT: [[TMP28:%.*]] = load ptr, ptr [[TMP27]], align 8 // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] // CHECK1: omp.inner.for.cond.i: -// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !62, !llvm.access.group [[ACC_GRP63:![0-9]+]] +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META62]], !llvm.access.group [[ACC_GRP63:![0-9]+]] // CHECK1-NEXT: [[CONV7_I:%.*]] = sext i32 [[TMP29]] to i64 -// CHECK1-NEXT: [[TMP30:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias !62, !llvm.access.group [[ACC_GRP63]] +// CHECK1-NEXT: [[TMP30:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META62]], !llvm.access.group [[ACC_GRP63]] // CHECK1-NEXT: [[CMP8_I:%.*]] = icmp ule i64 [[CONV7_I]], [[TMP30]] // CHECK1-NEXT: br i1 [[CMP8_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[OMP_INNER_FOR_END_I:%.*]] // CHECK1: omp.inner.for.body.i: -// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !62, !llvm.access.group [[ACC_GRP63]] -// CHECK1-NEXT: [[TMP32:%.*]] = load ptr, ptr [[TMP6_I]], align 8, !noalias !62, !llvm.access.group [[ACC_GRP63]] +// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META62]], !llvm.access.group [[ACC_GRP63]] +// CHECK1-NEXT: [[TMP32:%.*]] = load ptr, ptr [[TMP6_I]], align 8, !noalias [[META62]], !llvm.access.group [[ACC_GRP63]] // CHECK1-NEXT: store i32 [[TMP31]], ptr [[TMP32]], align 4, !llvm.access.group [[ACC_GRP63]] -// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !62, !llvm.access.group [[ACC_GRP63]] +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META62]], !llvm.access.group [[ACC_GRP63]] // CHECK1-NEXT: [[ADD9_I:%.*]] = add nsw i32 [[TMP33]], 1 -// CHECK1-NEXT: store i32 [[ADD9_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias !62, !llvm.access.group [[ACC_GRP63]] +// CHECK1-NEXT: store i32 [[ADD9_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias [[META62]], !llvm.access.group [[ACC_GRP63]] // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP64:![0-9]+]] // CHECK1: omp.inner.for.end.i: // CHECK1-NEXT: br label [[DOTOMP_OUTLINED__7_EXIT]] @@ -878,8 +878,8 @@ struct S { // CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i8, align 1 // CHECK2-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR__CASTED6:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR__CASTED8:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR__CASTED5:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR__CASTED7:%.*]] = alloca i64, align 8 // CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTBOUND_ZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) @@ -900,21 +900,21 @@ struct S { // CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @main.omp_outlined.1, i64 [[TMP6]]) // CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK2-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP7]], 0 -// CHECK2-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK2-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_3]], align 1 +// CHECK2-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK2-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_3]], align 1 // CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK2-NEXT: store i32 [[TMP8]], ptr [[DOTCAPTURE_EXPR_4]], align 4 // CHECK2-NEXT: [[TMP9:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_3]], align 1 -// CHECK2-NEXT: [[TOBOOL5:%.*]] = trunc i8 [[TMP9]] to i1 -// CHECK2-NEXT: [[FROMBOOL7:%.*]] = zext i1 [[TOBOOL5]] to i8 -// CHECK2-NEXT: store i8 [[FROMBOOL7]], ptr [[DOTCAPTURE_EXPR__CASTED6]], align 1 -// CHECK2-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED6]], align 8 +// CHECK2-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP9]] to i1 +// CHECK2-NEXT: [[STOREDV6:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK2-NEXT: store i8 [[STOREDV6]], ptr [[DOTCAPTURE_EXPR__CASTED5]], align 1 +// CHECK2-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED5]], align 8 // CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4 -// CHECK2-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4 -// CHECK2-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED8]], align 8 +// CHECK2-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR__CASTED7]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED7]], align 8 // CHECK2-NEXT: [[TMP13:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_3]], align 1 -// CHECK2-NEXT: [[TOBOOL9:%.*]] = trunc i8 [[TMP13]] to i1 -// CHECK2-NEXT: br i1 [[TOBOOL9]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK2-NEXT: [[LOADEDV8:%.*]] = trunc i8 [[TMP13]] to i1 +// CHECK2-NEXT: br i1 [[LOADEDV8]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK2: omp_if.then: // CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 5, ptr @main.omp_outlined.4, ptr [[I]], ptr [[ARGC_ADDR]], ptr [[ARGV_ADDR]], i64 [[TMP10]], i64 [[TMP12]]) // CHECK2-NEXT: br label [[OMP_IF_END:%.*]] @@ -1011,34 +1011,34 @@ struct S { // CHECK2-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) // CHECK2-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]]) // CHECK2-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) -// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14 -// CHECK2-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !14 -// CHECK2-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 -// CHECK2-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 -// CHECK2-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !14 -// CHECK2-NEXT: store i64 [[TMP9]], ptr [[DOTLB__ADDR_I]], align 8, !noalias !14 -// CHECK2-NEXT: store i64 [[TMP11]], ptr [[DOTUB__ADDR_I]], align 8, !noalias !14 -// CHECK2-NEXT: store i64 [[TMP13]], ptr [[DOTST__ADDR_I]], align 8, !noalias !14 -// CHECK2-NEXT: store i32 [[TMP15]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias !14 -// CHECK2-NEXT: store ptr [[TMP17]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !14 -// CHECK2-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14 -// CHECK2-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14 -// CHECK2-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias !14 +// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META14:![0-9]+]] +// CHECK2-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META14]] +// CHECK2-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META14]] +// CHECK2-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META14]] +// CHECK2-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META14]] +// CHECK2-NEXT: store i64 [[TMP9]], ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META14]] +// CHECK2-NEXT: store i64 [[TMP11]], ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META14]] +// CHECK2-NEXT: store i64 [[TMP13]], ptr [[DOTST__ADDR_I]], align 8, !noalias [[META14]] +// CHECK2-NEXT: store i32 [[TMP15]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias [[META14]] +// CHECK2-NEXT: store ptr [[TMP17]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias [[META14]] +// CHECK2-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META14]] +// CHECK2-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META14]] +// CHECK2-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META14]] // CHECK2-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP19]] to i32 -// CHECK2-NEXT: store i32 [[CONV_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias !14 +// CHECK2-NEXT: store i32 [[CONV_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias [[META14]] // CHECK2-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] // CHECK2: omp.inner.for.cond.i: -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !14 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META14]] // CHECK2-NEXT: [[CONV1_I:%.*]] = sext i32 [[TMP20]] to i64 -// CHECK2-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias !14 +// CHECK2-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META14]] // CHECK2-NEXT: [[CMP_I:%.*]] = icmp ule i64 [[CONV1_I]], [[TMP21]] // CHECK2-NEXT: br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK2: omp.inner.for.body.i: -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !14 -// CHECK2-NEXT: store i32 [[TMP22]], ptr [[I_I]], align 4, !noalias !14 -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !14 +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META14]] +// CHECK2-NEXT: store i32 [[TMP22]], ptr [[I_I]], align 4, !noalias [[META14]] +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META14]] // CHECK2-NEXT: [[ADD2_I:%.*]] = add nsw i32 [[TMP23]], 1 -// CHECK2-NEXT: store i32 [[ADD2_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias !14 +// CHECK2-NEXT: store i32 [[ADD2_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias [[META14]] // CHECK2-NEXT: br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP15:![0-9]+]] // CHECK2: .omp_outlined..exit: // CHECK2-NEXT: ret i32 0 @@ -1122,34 +1122,34 @@ struct S { // CHECK2-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META25:![0-9]+]]) // CHECK2-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META27:![0-9]+]]) // CHECK2-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META29:![0-9]+]]) -// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !31 -// CHECK2-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !31 -// CHECK2-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !31 -// CHECK2-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !31 -// CHECK2-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !31 -// CHECK2-NEXT: store i64 [[TMP9]], ptr [[DOTLB__ADDR_I]], align 8, !noalias !31 -// CHECK2-NEXT: store i64 [[TMP11]], ptr [[DOTUB__ADDR_I]], align 8, !noalias !31 -// CHECK2-NEXT: store i64 [[TMP13]], ptr [[DOTST__ADDR_I]], align 8, !noalias !31 -// CHECK2-NEXT: store i32 [[TMP15]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias !31 -// CHECK2-NEXT: store ptr [[TMP17]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !31 -// CHECK2-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !31 -// CHECK2-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !31 -// CHECK2-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias !31 +// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META31:![0-9]+]] +// CHECK2-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META31]] +// CHECK2-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META31]] +// CHECK2-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META31]] +// CHECK2-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META31]] +// CHECK2-NEXT: store i64 [[TMP9]], ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META31]] +// CHECK2-NEXT: store i64 [[TMP11]], ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META31]] +// CHECK2-NEXT: store i64 [[TMP13]], ptr [[DOTST__ADDR_I]], align 8, !noalias [[META31]] +// CHECK2-NEXT: store i32 [[TMP15]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias [[META31]] +// CHECK2-NEXT: store ptr [[TMP17]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias [[META31]] +// CHECK2-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META31]] +// CHECK2-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META31]] +// CHECK2-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META31]] // CHECK2-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP19]] to i32 -// CHECK2-NEXT: store i32 [[CONV_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias !31 +// CHECK2-NEXT: store i32 [[CONV_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias [[META31]] // CHECK2-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] // CHECK2: omp.inner.for.cond.i: -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !31, !llvm.access.group [[ACC_GRP32:![0-9]+]] +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META31]], !llvm.access.group [[ACC_GRP32:![0-9]+]] // CHECK2-NEXT: [[CONV1_I:%.*]] = sext i32 [[TMP20]] to i64 -// CHECK2-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias !31, !llvm.access.group [[ACC_GRP32]] +// CHECK2-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META31]], !llvm.access.group [[ACC_GRP32]] // CHECK2-NEXT: [[CMP_I:%.*]] = icmp ule i64 [[CONV1_I]], [[TMP21]] // CHECK2-NEXT: br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__2_EXIT:%.*]] // CHECK2: omp.inner.for.body.i: -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !31, !llvm.access.group [[ACC_GRP32]] -// CHECK2-NEXT: store i32 [[TMP22]], ptr [[I_I]], align 4, !noalias !31, !llvm.access.group [[ACC_GRP32]] -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !31, !llvm.access.group [[ACC_GRP32]] +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META31]], !llvm.access.group [[ACC_GRP32]] +// CHECK2-NEXT: store i32 [[TMP22]], ptr [[I_I]], align 4, !noalias [[META31]], !llvm.access.group [[ACC_GRP32]] +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META31]], !llvm.access.group [[ACC_GRP32]] // CHECK2-NEXT: [[ADD2_I:%.*]] = add nsw i32 [[TMP23]], 1 -// CHECK2-NEXT: store i32 [[ADD2_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias !31, !llvm.access.group [[ACC_GRP32]] +// CHECK2-NEXT: store i32 [[ADD2_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias [[META31]], !llvm.access.group [[ACC_GRP32]] // CHECK2-NEXT: br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP33:![0-9]+]] // CHECK2: .omp_outlined..2.exit: // CHECK2-NEXT: ret i32 0 @@ -1232,8 +1232,8 @@ struct S { // CHECK2-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP24]], ptr align 8 [[AGG_CAPTURED]], i64 24, i1 false) // CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_3]], ptr [[TMP21]], i32 0, i32 1 // CHECK2-NEXT: [[TMP26:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK2-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP26]] to i1 -// CHECK2-NEXT: [[TMP27:%.*]] = sext i1 [[TOBOOL]] to i32 +// CHECK2-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP26]] to i1 +// CHECK2-NEXT: [[TMP27:%.*]] = sext i1 [[LOADEDV]] to i32 // CHECK2-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP22]], i32 0, i32 5 // CHECK2-NEXT: store i64 0, ptr [[TMP28]], align 8 // CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP22]], i32 0, i32 6 @@ -1317,31 +1317,31 @@ struct S { // CHECK2-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META41:![0-9]+]]) // CHECK2-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META43:![0-9]+]]) // CHECK2-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META45:![0-9]+]]) -// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !47 -// CHECK2-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !47 -// CHECK2-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !47 -// CHECK2-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !47 -// CHECK2-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !47 -// CHECK2-NEXT: store i64 [[TMP10]], ptr [[DOTLB__ADDR_I]], align 8, !noalias !47 -// CHECK2-NEXT: store i64 [[TMP12]], ptr [[DOTUB__ADDR_I]], align 8, !noalias !47 -// CHECK2-NEXT: store i64 [[TMP14]], ptr [[DOTST__ADDR_I]], align 8, !noalias !47 -// CHECK2-NEXT: store i32 [[TMP16]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias !47 -// CHECK2-NEXT: store ptr [[TMP18]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !47 -// CHECK2-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !47 -// CHECK2-NEXT: [[TMP19:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !47 -// CHECK2-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !47 -// CHECK2-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !47 +// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META47:![0-9]+]] +// CHECK2-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META47]] +// CHECK2-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META47]] +// CHECK2-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META47]] +// CHECK2-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META47]] +// CHECK2-NEXT: store i64 [[TMP10]], ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META47]] +// CHECK2-NEXT: store i64 [[TMP12]], ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META47]] +// CHECK2-NEXT: store i64 [[TMP14]], ptr [[DOTST__ADDR_I]], align 8, !noalias [[META47]] +// CHECK2-NEXT: store i32 [[TMP16]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias [[META47]] +// CHECK2-NEXT: store ptr [[TMP18]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias [[META47]] +// CHECK2-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META47]] +// CHECK2-NEXT: [[TMP19:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META47]] +// CHECK2-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META47]] +// CHECK2-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META47]] // CHECK2-NEXT: call void [[TMP20]](ptr [[TMP21]], ptr [[DOTLASTPRIV_PTR_ADDR_I]]) #[[ATTR2]] // CHECK2-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP19]], align 8 -// CHECK2-NEXT: [[TMP23:%.*]] = load ptr, ptr [[DOTLASTPRIV_PTR_ADDR_I]], align 8, !noalias !47 +// CHECK2-NEXT: [[TMP23:%.*]] = load ptr, ptr [[DOTLASTPRIV_PTR_ADDR_I]], align 8, !noalias [[META47]] // CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_ANON_2:%.*]], ptr [[TMP19]], i32 0, i32 1 // CHECK2-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8 // CHECK2-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 -// CHECK2-NEXT: store i32 [[TMP26]], ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias !47 +// CHECK2-NEXT: store i32 [[TMP26]], ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias [[META47]] // CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_ANON_2]], ptr [[TMP19]], i32 0, i32 1 // CHECK2-NEXT: [[TMP28:%.*]] = load ptr, ptr [[TMP27]], align 8 // CHECK2-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 -// CHECK2-NEXT: store i32 [[TMP29]], ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47 +// CHECK2-NEXT: store i32 [[TMP29]], ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]] // CHECK2-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT_ANON_2]], ptr [[TMP19]], i32 0, i32 2 // CHECK2-NEXT: [[TMP31:%.*]] = load ptr, ptr [[TMP30]], align 8 // CHECK2-NEXT: [[TMP32:%.*]] = load ptr, ptr [[TMP31]], align 8 @@ -1358,63 +1358,63 @@ struct S { // CHECK2-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds i8, ptr [[TMP36]], i64 [[IDXPROM4_I]] // CHECK2-NEXT: [[TMP40:%.*]] = load i8, ptr [[ARRAYIDX5_I]], align 1 // CHECK2-NEXT: [[CONV_I:%.*]] = sext i8 [[TMP40]] to i32 -// CHECK2-NEXT: store i32 [[CONV_I]], ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47 -// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias !47 +// CHECK2-NEXT: store i32 [[CONV_I]], ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]] +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias [[META47]] // CHECK2-NEXT: [[CONV7_I:%.*]] = sext i32 [[TMP41]] to i64 -// CHECK2-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47 -// CHECK2-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47 +// CHECK2-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]] +// CHECK2-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]] // CHECK2-NEXT: [[SUB8_I:%.*]] = sub i32 [[TMP42]], [[TMP43]] // CHECK2-NEXT: [[SUB9_I:%.*]] = sub i32 [[SUB8_I]], 1 // CHECK2-NEXT: [[CONV11_I:%.*]] = zext i32 [[SUB8_I]] to i64 // CHECK2-NEXT: [[MUL_I:%.*]] = mul nsw i64 [[CONV7_I]], [[CONV11_I]] // CHECK2-NEXT: [[SUB12_I:%.*]] = sub nsw i64 [[MUL_I]], 1 -// CHECK2-NEXT: store i64 [[SUB12_I]], ptr [[DOTCAPTURE_EXPR_6_I]], align 8, !noalias !47 -// CHECK2-NEXT: store i32 0, ptr [[I_I]], align 4, !noalias !47 -// CHECK2-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47 -// CHECK2-NEXT: store i32 [[TMP44]], ptr [[J_I]], align 4, !noalias !47 -// CHECK2-NEXT: [[TMP45:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias !47 +// CHECK2-NEXT: store i64 [[SUB12_I]], ptr [[DOTCAPTURE_EXPR_6_I]], align 8, !noalias [[META47]] +// CHECK2-NEXT: store i32 0, ptr [[I_I]], align 4, !noalias [[META47]] +// CHECK2-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]] +// CHECK2-NEXT: store i32 [[TMP44]], ptr [[J_I]], align 4, !noalias [[META47]] +// CHECK2-NEXT: [[TMP45:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias [[META47]] // CHECK2-NEXT: [[CMP_I:%.*]] = icmp slt i32 0, [[TMP45]] // CHECK2-NEXT: br i1 [[CMP_I]], label [[LAND_LHS_TRUE_I:%.*]], label [[TASKLOOP_IF_END_I:%.*]] // CHECK2: land.lhs.true.i: -// CHECK2-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47 -// CHECK2-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47 +// CHECK2-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]] +// CHECK2-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]] // CHECK2-NEXT: [[CMP13_I:%.*]] = icmp slt i32 [[TMP46]], [[TMP47]] // CHECK2-NEXT: br i1 [[CMP13_I]], label [[TASKLOOP_IF_THEN_I:%.*]], label [[TASKLOOP_IF_END_I]] // CHECK2: taskloop.if.then.i: -// CHECK2-NEXT: [[TMP48:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias !47 -// CHECK2-NEXT: store i64 [[TMP48]], ptr [[DOTOMP_IV_I]], align 8, !noalias !47 +// CHECK2-NEXT: [[TMP48:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META47]] +// CHECK2-NEXT: store i64 [[TMP48]], ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]] // CHECK2-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT_ANON_2]], ptr [[TMP19]], i32 0, i32 1 // CHECK2-NEXT: [[TMP50:%.*]] = load ptr, ptr [[TMP49]], align 8 // CHECK2-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_ANON_2]], ptr [[TMP19]], i32 0, i32 2 // CHECK2-NEXT: [[TMP52:%.*]] = load ptr, ptr [[TMP51]], align 8 // CHECK2-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] // CHECK2: omp.inner.for.cond.i: -// CHECK2-NEXT: [[TMP53:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48:![0-9]+]] -// CHECK2-NEXT: [[TMP54:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK2-NEXT: [[TMP53:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48:![0-9]+]] +// CHECK2-NEXT: [[TMP54:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK2-NEXT: [[CMP16_I:%.*]] = icmp ule i64 [[TMP53]], [[TMP54]] // CHECK2-NEXT: br i1 [[CMP16_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[OMP_INNER_FOR_END_I:%.*]] // CHECK2: omp.inner.for.body.i: -// CHECK2-NEXT: [[TMP55:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK2-NEXT: [[TMP56:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK2-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK2-NEXT: [[TMP55:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK2-NEXT: [[TMP56:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK2-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK2-NEXT: [[SUB17_I:%.*]] = sub i32 [[TMP56]], [[TMP57]] // CHECK2-NEXT: [[SUB18_I:%.*]] = sub i32 [[SUB17_I]], 1 // CHECK2-NEXT: [[CONV22_I:%.*]] = zext i32 [[SUB17_I]] to i64 // CHECK2-NEXT: [[DIV23_I:%.*]] = sdiv i64 [[TMP55]], [[CONV22_I]] // CHECK2-NEXT: [[CONV26_I:%.*]] = trunc i64 [[DIV23_I]] to i32 -// CHECK2-NEXT: store i32 [[CONV26_I]], ptr [[I14_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK2-NEXT: [[TMP58:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK2-NEXT: store i32 [[CONV26_I]], ptr [[I14_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK2-NEXT: [[TMP58:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK2-NEXT: [[CONV27_I:%.*]] = sext i32 [[TMP58]] to i64 -// CHECK2-NEXT: [[TMP59:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK2-NEXT: [[TMP60:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK2-NEXT: [[TMP61:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK2-NEXT: [[TMP62:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK2-NEXT: [[TMP59:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK2-NEXT: [[TMP60:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK2-NEXT: [[TMP61:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK2-NEXT: [[TMP62:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK2-NEXT: [[SUB28_I:%.*]] = sub i32 [[TMP61]], [[TMP62]] // CHECK2-NEXT: [[SUB29_I:%.*]] = sub i32 [[SUB28_I]], 1 // CHECK2-NEXT: [[CONV33_I:%.*]] = zext i32 [[SUB28_I]] to i64 // CHECK2-NEXT: [[DIV34_I:%.*]] = sdiv i64 [[TMP60]], [[CONV33_I]] -// CHECK2-NEXT: [[TMP63:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK2-NEXT: [[TMP64:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK2-NEXT: [[TMP63:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK2-NEXT: [[TMP64:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK2-NEXT: [[SUB35_I:%.*]] = sub i32 [[TMP63]], [[TMP64]] // CHECK2-NEXT: [[SUB36_I:%.*]] = sub i32 [[SUB35_I]], 1 // CHECK2-NEXT: [[CONV40_I:%.*]] = zext i32 [[SUB35_I]] to i64 @@ -1422,15 +1422,15 @@ struct S { // CHECK2-NEXT: [[SUB42_I:%.*]] = sub nsw i64 [[TMP59]], [[MUL41_I]] // CHECK2-NEXT: [[ADD44_I:%.*]] = add nsw i64 [[CONV27_I]], [[SUB42_I]] // CHECK2-NEXT: [[CONV45_I:%.*]] = trunc i64 [[ADD44_I]] to i32 -// CHECK2-NEXT: store i32 [[CONV45_I]], ptr [[J15_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK2-NEXT: [[TMP65:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK2-NEXT: store i32 [[CONV45_I]], ptr [[J15_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK2-NEXT: [[TMP65:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK2-NEXT: [[ADD46_I:%.*]] = add nsw i64 [[TMP65]], 1 -// CHECK2-NEXT: store i64 [[ADD46_I]], ptr [[DOTOMP_IV_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK2-NEXT: store i64 [[ADD46_I]], ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK2-NEXT: br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP49:![0-9]+]] // CHECK2: omp.inner.for.end.i: // CHECK2-NEXT: br label [[TASKLOOP_IF_END_I]] // CHECK2: taskloop.if.end.i: -// CHECK2-NEXT: [[TMP66:%.*]] = load i32, ptr [[DOTLITER__ADDR_I]], align 4, !noalias !47 +// CHECK2-NEXT: [[TMP66:%.*]] = load i32, ptr [[DOTLITER__ADDR_I]], align 4, !noalias [[META47]] // CHECK2-NEXT: [[TMP67:%.*]] = icmp ne i32 [[TMP66]], 0 // CHECK2-NEXT: br i1 [[TMP67]], label [[DOTOMP_LASTPRIVATE_THEN_I:%.*]], label [[DOTOMP_OUTLINED__5_EXIT:%.*]] // CHECK2: .omp.lastprivate.then.i: @@ -1489,12 +1489,12 @@ struct S { // CHECK2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[C_ADDR]], align 4 // CHECK2-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 -// CHECK2-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK2-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK2-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK2-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK2-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK2-NEXT: [[TOBOOL2:%.*]] = trunc i8 [[TMP1]] to i1 -// CHECK2-NEXT: [[FROMBOOL3:%.*]] = zext i1 [[TOBOOL2]] to i8 -// CHECK2-NEXT: store i8 [[FROMBOOL3]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK2-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP1]] to i1 +// CHECK2-NEXT: [[STOREDV2:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK2-NEXT: store i8 [[STOREDV2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK2-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @_ZN1SC2Ei.omp_outlined, ptr [[THIS1]], ptr [[C_ADDR]], i64 [[TMP2]]) // CHECK2-NEXT: ret void @@ -1532,7 +1532,7 @@ struct S { // CHECK2-NEXT: store ptr [[TMP1]], ptr [[TMP7]], align 8 // CHECK2-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP3]]) // CHECK2-NEXT: [[TMP8:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK2-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP8]] to i1 +// CHECK2-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP8]] to i1 // CHECK2-NEXT: store ptr [[TMP]], ptr [[_TMP1]], align 8 // CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP1]], align 4 // CHECK2-NEXT: store i32 [[TMP9]], ptr [[DOTCAPTURE_EXPR_2]], align 4 @@ -1541,7 +1541,7 @@ struct S { // CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 // CHECK2-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 // CHECK2-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: [[TMP11:%.*]] = select i1 [[TOBOOL]], i32 2, i32 0 +// CHECK2-NEXT: [[TMP11:%.*]] = select i1 [[LOADEDV]], i32 2, i32 0 // CHECK2-NEXT: [[TMP12:%.*]] = or i32 [[TMP11]], 1 // CHECK2-NEXT: [[TMP13:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP3]], i32 [[TMP12]], i64 80, i64 16, ptr @.omp_task_entry..8) // CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_5:%.*]], ptr [[TMP13]], i32 0, i32 0 @@ -1615,54 +1615,54 @@ struct S { // CHECK2-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META56:![0-9]+]]) // CHECK2-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META58:![0-9]+]]) // CHECK2-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META60:![0-9]+]]) -// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !62 -// CHECK2-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !62 -// CHECK2-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !62 -// CHECK2-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !62 -// CHECK2-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !62 -// CHECK2-NEXT: store i64 [[TMP9]], ptr [[DOTLB__ADDR_I]], align 8, !noalias !62 -// CHECK2-NEXT: store i64 [[TMP11]], ptr [[DOTUB__ADDR_I]], align 8, !noalias !62 -// CHECK2-NEXT: store i64 [[TMP13]], ptr [[DOTST__ADDR_I]], align 8, !noalias !62 -// CHECK2-NEXT: store i32 [[TMP15]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias !62 -// CHECK2-NEXT: store ptr [[TMP17]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !62 -// CHECK2-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !62 -// CHECK2-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !62 +// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META62:![0-9]+]] +// CHECK2-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META62]] +// CHECK2-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META62]] +// CHECK2-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META62]] +// CHECK2-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META62]] +// CHECK2-NEXT: store i64 [[TMP9]], ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META62]] +// CHECK2-NEXT: store i64 [[TMP11]], ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META62]] +// CHECK2-NEXT: store i64 [[TMP13]], ptr [[DOTST__ADDR_I]], align 8, !noalias [[META62]] +// CHECK2-NEXT: store i32 [[TMP15]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias [[META62]] +// CHECK2-NEXT: store ptr [[TMP17]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias [[META62]] +// CHECK2-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META62]] +// CHECK2-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META62]] // CHECK2-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP18]], align 8 -// CHECK2-NEXT: store ptr [[TMP_I]], ptr [[TMP1_I]], align 8, !noalias !62 +// CHECK2-NEXT: store ptr [[TMP_I]], ptr [[TMP1_I]], align 8, !noalias [[META62]] // CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_ANON_4:%.*]], ptr [[TMP18]], i32 0, i32 1 // CHECK2-NEXT: [[TMP21:%.*]] = load ptr, ptr [[TMP20]], align 8 // CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 -// CHECK2-NEXT: store i32 [[TMP22]], ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias !62 -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias !62 +// CHECK2-NEXT: store i32 [[TMP22]], ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias [[META62]] +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias [[META62]] // CHECK2-NEXT: [[SUB3_I:%.*]] = sub nsw i32 [[TMP23]], 1 -// CHECK2-NEXT: store i32 [[SUB3_I]], ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !62 -// CHECK2-NEXT: store ptr [[A_I]], ptr [[TMP4_I]], align 8, !noalias !62 -// CHECK2-NEXT: [[TMP24:%.*]] = load ptr, ptr [[TMP4_I]], align 8, !noalias !62 +// CHECK2-NEXT: store i32 [[SUB3_I]], ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META62]] +// CHECK2-NEXT: store ptr [[A_I]], ptr [[TMP4_I]], align 8, !noalias [[META62]] +// CHECK2-NEXT: [[TMP24:%.*]] = load ptr, ptr [[TMP4_I]], align 8, !noalias [[META62]] // CHECK2-NEXT: store i32 0, ptr [[TMP24]], align 4 -// CHECK2-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias !62 +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias [[META62]] // CHECK2-NEXT: [[CMP_I:%.*]] = icmp slt i32 0, [[TMP25]] // CHECK2-NEXT: br i1 [[CMP_I]], label [[TASKLOOP_IF_THEN_I:%.*]], label [[DOTOMP_OUTLINED__7_EXIT:%.*]] // CHECK2: taskloop.if.then.i: -// CHECK2-NEXT: store ptr [[A5_I]], ptr [[TMP6_I]], align 8, !noalias !62 -// CHECK2-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias !62 +// CHECK2-NEXT: store ptr [[A5_I]], ptr [[TMP6_I]], align 8, !noalias [[META62]] +// CHECK2-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META62]] // CHECK2-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP26]] to i32 -// CHECK2-NEXT: store i32 [[CONV_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias !62 +// CHECK2-NEXT: store i32 [[CONV_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias [[META62]] // CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_ANON_4]], ptr [[TMP18]], i32 0, i32 1 // CHECK2-NEXT: [[TMP28:%.*]] = load ptr, ptr [[TMP27]], align 8 // CHECK2-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] // CHECK2: omp.inner.for.cond.i: -// CHECK2-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !62, !llvm.access.group [[ACC_GRP63:![0-9]+]] +// CHECK2-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META62]], !llvm.access.group [[ACC_GRP63:![0-9]+]] // CHECK2-NEXT: [[CONV7_I:%.*]] = sext i32 [[TMP29]] to i64 -// CHECK2-NEXT: [[TMP30:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias !62, !llvm.access.group [[ACC_GRP63]] +// CHECK2-NEXT: [[TMP30:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META62]], !llvm.access.group [[ACC_GRP63]] // CHECK2-NEXT: [[CMP8_I:%.*]] = icmp ule i64 [[CONV7_I]], [[TMP30]] // CHECK2-NEXT: br i1 [[CMP8_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[OMP_INNER_FOR_END_I:%.*]] // CHECK2: omp.inner.for.body.i: -// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !62, !llvm.access.group [[ACC_GRP63]] -// CHECK2-NEXT: [[TMP32:%.*]] = load ptr, ptr [[TMP6_I]], align 8, !noalias !62, !llvm.access.group [[ACC_GRP63]] +// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META62]], !llvm.access.group [[ACC_GRP63]] +// CHECK2-NEXT: [[TMP32:%.*]] = load ptr, ptr [[TMP6_I]], align 8, !noalias [[META62]], !llvm.access.group [[ACC_GRP63]] // CHECK2-NEXT: store i32 [[TMP31]], ptr [[TMP32]], align 4, !llvm.access.group [[ACC_GRP63]] -// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !62, !llvm.access.group [[ACC_GRP63]] +// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META62]], !llvm.access.group [[ACC_GRP63]] // CHECK2-NEXT: [[ADD9_I:%.*]] = add nsw i32 [[TMP33]], 1 -// CHECK2-NEXT: store i32 [[ADD9_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias !62, !llvm.access.group [[ACC_GRP63]] +// CHECK2-NEXT: store i32 [[ADD9_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias [[META62]], !llvm.access.group [[ACC_GRP63]] // CHECK2-NEXT: br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP64:![0-9]+]] // CHECK2: omp.inner.for.end.i: // CHECK2-NEXT: br label [[DOTOMP_OUTLINED__7_EXIT]] @@ -1690,8 +1690,8 @@ struct S { // CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i8, align 1 // CHECK3-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED6:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED8:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED5:%.*]] = alloca i64, align 8 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED7:%.*]] = alloca i64, align 8 // CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[DOTBOUND_ZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) @@ -1712,21 +1712,21 @@ struct S { // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 1, ptr @main.omp_outlined.1, i64 [[TMP6]]) // CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK3-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP7]], 0 -// CHECK3-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_3]], align 1 +// CHECK3-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_3]], align 1 // CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP8]], ptr [[DOTCAPTURE_EXPR_4]], align 4 // CHECK3-NEXT: [[TMP9:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_3]], align 1 -// CHECK3-NEXT: [[TOBOOL5:%.*]] = trunc i8 [[TMP9]] to i1 -// CHECK3-NEXT: [[FROMBOOL7:%.*]] = zext i1 [[TOBOOL5]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL7]], ptr [[DOTCAPTURE_EXPR__CASTED6]], align 1 -// CHECK3-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED6]], align 8 +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP9]] to i1 +// CHECK3-NEXT: [[STOREDV6:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV6]], ptr [[DOTCAPTURE_EXPR__CASTED5]], align 1 +// CHECK3-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED5]], align 8 // CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4 -// CHECK3-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED8]], align 8 +// CHECK3-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR__CASTED7]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED7]], align 8 // CHECK3-NEXT: [[TMP13:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_3]], align 1 -// CHECK3-NEXT: [[TOBOOL9:%.*]] = trunc i8 [[TMP13]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL9]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK3-NEXT: [[LOADEDV8:%.*]] = trunc i8 [[TMP13]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV8]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK3: omp_if.then: // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 5, ptr @main.omp_outlined.4, ptr [[I]], ptr [[ARGC_ADDR]], ptr [[ARGV_ADDR]], i64 [[TMP10]], i64 [[TMP12]]) // CHECK3-NEXT: br label [[OMP_IF_END:%.*]] @@ -1823,34 +1823,34 @@ struct S { // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) -// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14 -// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !14 -// CHECK3-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 -// CHECK3-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 -// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !14 -// CHECK3-NEXT: store i64 [[TMP9]], ptr [[DOTLB__ADDR_I]], align 8, !noalias !14 -// CHECK3-NEXT: store i64 [[TMP11]], ptr [[DOTUB__ADDR_I]], align 8, !noalias !14 -// CHECK3-NEXT: store i64 [[TMP13]], ptr [[DOTST__ADDR_I]], align 8, !noalias !14 -// CHECK3-NEXT: store i32 [[TMP15]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias !14 -// CHECK3-NEXT: store ptr [[TMP17]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !14 -// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14 -// CHECK3-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14 -// CHECK3-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias !14 +// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META14:![0-9]+]] +// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META14]] +// CHECK3-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META14]] +// CHECK3-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META14]] +// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META14]] +// CHECK3-NEXT: store i64 [[TMP9]], ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META14]] +// CHECK3-NEXT: store i64 [[TMP11]], ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META14]] +// CHECK3-NEXT: store i64 [[TMP13]], ptr [[DOTST__ADDR_I]], align 8, !noalias [[META14]] +// CHECK3-NEXT: store i32 [[TMP15]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias [[META14]] +// CHECK3-NEXT: store ptr [[TMP17]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias [[META14]] +// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META14]] +// CHECK3-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META14]] +// CHECK3-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META14]] // CHECK3-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP19]] to i32 -// CHECK3-NEXT: store i32 [[CONV_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias !14 +// CHECK3-NEXT: store i32 [[CONV_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias [[META14]] // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] // CHECK3: omp.inner.for.cond.i: -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !14 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META14]] // CHECK3-NEXT: [[CONV1_I:%.*]] = sext i32 [[TMP20]] to i64 -// CHECK3-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias !14 +// CHECK3-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META14]] // CHECK3-NEXT: [[CMP_I:%.*]] = icmp ule i64 [[CONV1_I]], [[TMP21]] // CHECK3-NEXT: br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK3: omp.inner.for.body.i: -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !14 -// CHECK3-NEXT: store i32 [[TMP22]], ptr [[I_I]], align 4, !noalias !14 -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !14 +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META14]] +// CHECK3-NEXT: store i32 [[TMP22]], ptr [[I_I]], align 4, !noalias [[META14]] +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META14]] // CHECK3-NEXT: [[ADD2_I:%.*]] = add nsw i32 [[TMP23]], 1 -// CHECK3-NEXT: store i32 [[ADD2_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias !14 +// CHECK3-NEXT: store i32 [[ADD2_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias [[META14]] // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP15:![0-9]+]] // CHECK3: .omp_outlined..exit: // CHECK3-NEXT: ret i32 0 @@ -1934,34 +1934,34 @@ struct S { // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META25:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META27:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META29:![0-9]+]]) -// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !31 -// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !31 -// CHECK3-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !31 -// CHECK3-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !31 -// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !31 -// CHECK3-NEXT: store i64 [[TMP9]], ptr [[DOTLB__ADDR_I]], align 8, !noalias !31 -// CHECK3-NEXT: store i64 [[TMP11]], ptr [[DOTUB__ADDR_I]], align 8, !noalias !31 -// CHECK3-NEXT: store i64 [[TMP13]], ptr [[DOTST__ADDR_I]], align 8, !noalias !31 -// CHECK3-NEXT: store i32 [[TMP15]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias !31 -// CHECK3-NEXT: store ptr [[TMP17]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !31 -// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !31 -// CHECK3-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !31 -// CHECK3-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias !31 +// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META31:![0-9]+]] +// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META31]] +// CHECK3-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META31]] +// CHECK3-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META31]] +// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META31]] +// CHECK3-NEXT: store i64 [[TMP9]], ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META31]] +// CHECK3-NEXT: store i64 [[TMP11]], ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META31]] +// CHECK3-NEXT: store i64 [[TMP13]], ptr [[DOTST__ADDR_I]], align 8, !noalias [[META31]] +// CHECK3-NEXT: store i32 [[TMP15]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias [[META31]] +// CHECK3-NEXT: store ptr [[TMP17]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias [[META31]] +// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META31]] +// CHECK3-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META31]] +// CHECK3-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META31]] // CHECK3-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP19]] to i32 -// CHECK3-NEXT: store i32 [[CONV_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias !31 +// CHECK3-NEXT: store i32 [[CONV_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias [[META31]] // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] // CHECK3: omp.inner.for.cond.i: -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !31, !llvm.access.group [[ACC_GRP32:![0-9]+]] +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META31]], !llvm.access.group [[ACC_GRP32:![0-9]+]] // CHECK3-NEXT: [[CONV1_I:%.*]] = sext i32 [[TMP20]] to i64 -// CHECK3-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias !31, !llvm.access.group [[ACC_GRP32]] +// CHECK3-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META31]], !llvm.access.group [[ACC_GRP32]] // CHECK3-NEXT: [[CMP_I:%.*]] = icmp ule i64 [[CONV1_I]], [[TMP21]] // CHECK3-NEXT: br i1 [[CMP_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__2_EXIT:%.*]] // CHECK3: omp.inner.for.body.i: -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !31, !llvm.access.group [[ACC_GRP32]] -// CHECK3-NEXT: store i32 [[TMP22]], ptr [[I_I]], align 4, !noalias !31, !llvm.access.group [[ACC_GRP32]] -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !31, !llvm.access.group [[ACC_GRP32]] +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META31]], !llvm.access.group [[ACC_GRP32]] +// CHECK3-NEXT: store i32 [[TMP22]], ptr [[I_I]], align 4, !noalias [[META31]], !llvm.access.group [[ACC_GRP32]] +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META31]], !llvm.access.group [[ACC_GRP32]] // CHECK3-NEXT: [[ADD2_I:%.*]] = add nsw i32 [[TMP23]], 1 -// CHECK3-NEXT: store i32 [[ADD2_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias !31, !llvm.access.group [[ACC_GRP32]] +// CHECK3-NEXT: store i32 [[ADD2_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias [[META31]], !llvm.access.group [[ACC_GRP32]] // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP33:![0-9]+]] // CHECK3: .omp_outlined..2.exit: // CHECK3-NEXT: ret i32 0 @@ -2008,9 +2008,9 @@ struct S { // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP9]], align 8 // CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_ANON_2]], ptr [[AGG_CAPTURED]], i32 0, i32 3 // CHECK3-NEXT: [[TMP11:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK3-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP11]] to i1 -// CHECK3-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL]], ptr [[TMP10]], align 8 +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP11]] to i1 +// CHECK3-NEXT: [[STOREDV:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV]], ptr [[TMP10]], align 8 // CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4 // CHECK3-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP4]]) // CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP1]], align 4 @@ -2049,8 +2049,8 @@ struct S { // CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP26]], ptr align 8 [[AGG_CAPTURED]], i64 32, i1 false) // CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_3]], ptr [[TMP23]], i32 0, i32 1 // CHECK3-NEXT: [[TMP28:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK3-NEXT: [[TOBOOL16:%.*]] = trunc i8 [[TMP28]] to i1 -// CHECK3-NEXT: [[TMP29:%.*]] = sext i1 [[TOBOOL16]] to i32 +// CHECK3-NEXT: [[LOADEDV16:%.*]] = trunc i8 [[TMP28]] to i1 +// CHECK3-NEXT: [[TMP29:%.*]] = sext i1 [[LOADEDV16]] to i32 // CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP24]], i32 0, i32 5 // CHECK3-NEXT: store i64 0, ptr [[TMP30]], align 8 // CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP24]], i32 0, i32 6 @@ -2134,31 +2134,31 @@ struct S { // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META41:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META43:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META45:![0-9]+]]) -// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !47 -// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !47 -// CHECK3-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !47 -// CHECK3-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !47 -// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !47 -// CHECK3-NEXT: store i64 [[TMP10]], ptr [[DOTLB__ADDR_I]], align 8, !noalias !47 -// CHECK3-NEXT: store i64 [[TMP12]], ptr [[DOTUB__ADDR_I]], align 8, !noalias !47 -// CHECK3-NEXT: store i64 [[TMP14]], ptr [[DOTST__ADDR_I]], align 8, !noalias !47 -// CHECK3-NEXT: store i32 [[TMP16]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias !47 -// CHECK3-NEXT: store ptr [[TMP18]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !47 -// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !47 -// CHECK3-NEXT: [[TMP19:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !47 -// CHECK3-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !47 -// CHECK3-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !47 +// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META47:![0-9]+]] +// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META47]] +// CHECK3-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META47]] +// CHECK3-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META47]] +// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META47]] +// CHECK3-NEXT: store i64 [[TMP10]], ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META47]] +// CHECK3-NEXT: store i64 [[TMP12]], ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META47]] +// CHECK3-NEXT: store i64 [[TMP14]], ptr [[DOTST__ADDR_I]], align 8, !noalias [[META47]] +// CHECK3-NEXT: store i32 [[TMP16]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias [[META47]] +// CHECK3-NEXT: store ptr [[TMP18]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias [[META47]] +// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META47]] +// CHECK3-NEXT: [[TMP19:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META47]] +// CHECK3-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META47]] +// CHECK3-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META47]] // CHECK3-NEXT: call void [[TMP20]](ptr [[TMP21]], ptr [[DOTLASTPRIV_PTR_ADDR_I]]) #[[ATTR2]] // CHECK3-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP19]], align 8 -// CHECK3-NEXT: [[TMP23:%.*]] = load ptr, ptr [[DOTLASTPRIV_PTR_ADDR_I]], align 8, !noalias !47 +// CHECK3-NEXT: [[TMP23:%.*]] = load ptr, ptr [[DOTLASTPRIV_PTR_ADDR_I]], align 8, !noalias [[META47]] // CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_ANON_2:%.*]], ptr [[TMP19]], i32 0, i32 1 // CHECK3-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8 // CHECK3-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 -// CHECK3-NEXT: store i32 [[TMP26]], ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias !47 +// CHECK3-NEXT: store i32 [[TMP26]], ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias [[META47]] // CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_ANON_2]], ptr [[TMP19]], i32 0, i32 1 // CHECK3-NEXT: [[TMP28:%.*]] = load ptr, ptr [[TMP27]], align 8 // CHECK3-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 -// CHECK3-NEXT: store i32 [[TMP29]], ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47 +// CHECK3-NEXT: store i32 [[TMP29]], ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]] // CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT_ANON_2]], ptr [[TMP19]], i32 0, i32 2 // CHECK3-NEXT: [[TMP31:%.*]] = load ptr, ptr [[TMP30]], align 8 // CHECK3-NEXT: [[TMP32:%.*]] = load ptr, ptr [[TMP31]], align 8 @@ -2175,68 +2175,68 @@ struct S { // CHECK3-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds i8, ptr [[TMP36]], i64 [[IDXPROM4_I]] // CHECK3-NEXT: [[TMP40:%.*]] = load i8, ptr [[ARRAYIDX5_I]], align 1 // CHECK3-NEXT: [[CONV_I:%.*]] = sext i8 [[TMP40]] to i32 -// CHECK3-NEXT: store i32 [[CONV_I]], ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47 -// CHECK3-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias !47 +// CHECK3-NEXT: store i32 [[CONV_I]], ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]] +// CHECK3-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias [[META47]] // CHECK3-NEXT: [[CONV7_I:%.*]] = sext i32 [[TMP41]] to i64 -// CHECK3-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47 -// CHECK3-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47 +// CHECK3-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]] +// CHECK3-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]] // CHECK3-NEXT: [[SUB8_I:%.*]] = sub i32 [[TMP42]], [[TMP43]] // CHECK3-NEXT: [[SUB9_I:%.*]] = sub i32 [[SUB8_I]], 1 // CHECK3-NEXT: [[CONV11_I:%.*]] = zext i32 [[SUB8_I]] to i64 // CHECK3-NEXT: [[MUL_I:%.*]] = mul nsw i64 [[CONV7_I]], [[CONV11_I]] // CHECK3-NEXT: [[SUB12_I:%.*]] = sub nsw i64 [[MUL_I]], 1 -// CHECK3-NEXT: store i64 [[SUB12_I]], ptr [[DOTCAPTURE_EXPR_6_I]], align 8, !noalias !47 -// CHECK3-NEXT: store i32 0, ptr [[I_I]], align 4, !noalias !47 -// CHECK3-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47 -// CHECK3-NEXT: store i32 [[TMP44]], ptr [[J_I]], align 4, !noalias !47 -// CHECK3-NEXT: [[TMP45:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias !47 +// CHECK3-NEXT: store i64 [[SUB12_I]], ptr [[DOTCAPTURE_EXPR_6_I]], align 8, !noalias [[META47]] +// CHECK3-NEXT: store i32 0, ptr [[I_I]], align 4, !noalias [[META47]] +// CHECK3-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]] +// CHECK3-NEXT: store i32 [[TMP44]], ptr [[J_I]], align 4, !noalias [[META47]] +// CHECK3-NEXT: [[TMP45:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias [[META47]] // CHECK3-NEXT: [[CMP_I:%.*]] = icmp slt i32 0, [[TMP45]] // CHECK3-NEXT: br i1 [[CMP_I]], label [[LAND_LHS_TRUE_I:%.*]], label [[TASKLOOP_IF_END_I:%.*]] // CHECK3: land.lhs.true.i: -// CHECK3-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47 -// CHECK3-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47 +// CHECK3-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]] +// CHECK3-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]] // CHECK3-NEXT: [[CMP13_I:%.*]] = icmp slt i32 [[TMP46]], [[TMP47]] // CHECK3-NEXT: br i1 [[CMP13_I]], label [[TASKLOOP_IF_THEN_I:%.*]], label [[TASKLOOP_IF_END_I]] // CHECK3: taskloop.if.then.i: -// CHECK3-NEXT: [[TMP48:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias !47 -// CHECK3-NEXT: store i64 [[TMP48]], ptr [[DOTOMP_IV_I]], align 8, !noalias !47 +// CHECK3-NEXT: [[TMP48:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META47]] +// CHECK3-NEXT: store i64 [[TMP48]], ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]] // CHECK3-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT_ANON_2]], ptr [[TMP19]], i32 0, i32 1 // CHECK3-NEXT: [[TMP50:%.*]] = load ptr, ptr [[TMP49]], align 8 // CHECK3-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_ANON_2]], ptr [[TMP19]], i32 0, i32 2 // CHECK3-NEXT: [[TMP52:%.*]] = load ptr, ptr [[TMP51]], align 8 // CHECK3-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT_ANON_2]], ptr [[TMP19]], i32 0, i32 3 // CHECK3-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1 -// CHECK3-NEXT: [[TOBOOL_I:%.*]] = trunc i8 [[TMP54]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL_I]], label [[OMP_IF_THEN_I:%.*]], label [[OMP_IF_ELSE_I:%.*]] +// CHECK3-NEXT: [[LOADEDV_I:%.*]] = trunc i8 [[TMP54]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV_I]], label [[OMP_IF_THEN_I:%.*]], label [[OMP_IF_ELSE_I:%.*]] // CHECK3: omp_if.then.i: // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] // CHECK3: omp.inner.for.cond.i: -// CHECK3-NEXT: [[TMP55:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48:![0-9]+]] -// CHECK3-NEXT: [[TMP56:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK3-NEXT: [[TMP55:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48:![0-9]+]] +// CHECK3-NEXT: [[TMP56:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK3-NEXT: [[CMP16_I:%.*]] = icmp ule i64 [[TMP55]], [[TMP56]] // CHECK3-NEXT: br i1 [[CMP16_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[OMP_INNER_FOR_END_I:%.*]] // CHECK3: omp.inner.for.body.i: -// CHECK3-NEXT: [[TMP57:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK3-NEXT: [[TMP58:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK3-NEXT: [[TMP59:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK3-NEXT: [[TMP57:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK3-NEXT: [[TMP58:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK3-NEXT: [[TMP59:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK3-NEXT: [[SUB17_I:%.*]] = sub i32 [[TMP58]], [[TMP59]] // CHECK3-NEXT: [[SUB18_I:%.*]] = sub i32 [[SUB17_I]], 1 // CHECK3-NEXT: [[CONV22_I:%.*]] = zext i32 [[SUB17_I]] to i64 // CHECK3-NEXT: [[DIV23_I:%.*]] = sdiv i64 [[TMP57]], [[CONV22_I]] // CHECK3-NEXT: [[CONV26_I:%.*]] = trunc i64 [[DIV23_I]] to i32 -// CHECK3-NEXT: store i32 [[CONV26_I]], ptr [[I14_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK3-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK3-NEXT: store i32 [[CONV26_I]], ptr [[I14_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK3-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK3-NEXT: [[CONV27_I:%.*]] = sext i32 [[TMP60]] to i64 -// CHECK3-NEXT: [[TMP61:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK3-NEXT: [[TMP62:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK3-NEXT: [[TMP63:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK3-NEXT: [[TMP64:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK3-NEXT: [[TMP61:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK3-NEXT: [[TMP62:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK3-NEXT: [[TMP63:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK3-NEXT: [[TMP64:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK3-NEXT: [[SUB28_I:%.*]] = sub i32 [[TMP63]], [[TMP64]] // CHECK3-NEXT: [[SUB29_I:%.*]] = sub i32 [[SUB28_I]], 1 // CHECK3-NEXT: [[CONV33_I:%.*]] = zext i32 [[SUB28_I]] to i64 // CHECK3-NEXT: [[DIV34_I:%.*]] = sdiv i64 [[TMP62]], [[CONV33_I]] -// CHECK3-NEXT: [[TMP65:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK3-NEXT: [[TMP66:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK3-NEXT: [[TMP65:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK3-NEXT: [[TMP66:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK3-NEXT: [[SUB35_I:%.*]] = sub i32 [[TMP65]], [[TMP66]] // CHECK3-NEXT: [[SUB36_I:%.*]] = sub i32 [[SUB35_I]], 1 // CHECK3-NEXT: [[CONV40_I:%.*]] = zext i32 [[SUB35_I]] to i64 @@ -2244,42 +2244,42 @@ struct S { // CHECK3-NEXT: [[SUB42_I:%.*]] = sub nsw i64 [[TMP61]], [[MUL41_I]] // CHECK3-NEXT: [[ADD44_I:%.*]] = add nsw i64 [[CONV27_I]], [[SUB42_I]] // CHECK3-NEXT: [[CONV45_I:%.*]] = trunc i64 [[ADD44_I]] to i32 -// CHECK3-NEXT: store i32 [[CONV45_I]], ptr [[J15_I]], align 4, !noalias !47, !llvm.access.group [[ACC_GRP48]] -// CHECK3-NEXT: [[TMP67:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK3-NEXT: store i32 [[CONV45_I]], ptr [[J15_I]], align 4, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] +// CHECK3-NEXT: [[TMP67:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK3-NEXT: [[ADD46_I:%.*]] = add nsw i64 [[TMP67]], 1 -// CHECK3-NEXT: store i64 [[ADD46_I]], ptr [[DOTOMP_IV_I]], align 8, !noalias !47, !llvm.access.group [[ACC_GRP48]] +// CHECK3-NEXT: store i64 [[ADD46_I]], ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]], !llvm.access.group [[ACC_GRP48]] // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP49:![0-9]+]] // CHECK3: omp.inner.for.end.i: // CHECK3-NEXT: br label [[OMP_IF_END_I:%.*]] // CHECK3: omp_if.else.i: // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND47_I:%.*]] // CHECK3: omp.inner.for.cond47.i: -// CHECK3-NEXT: [[TMP68:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47 -// CHECK3-NEXT: [[TMP69:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias !47 +// CHECK3-NEXT: [[TMP68:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]] +// CHECK3-NEXT: [[TMP69:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META47]] // CHECK3-NEXT: [[CMP48_I:%.*]] = icmp ule i64 [[TMP68]], [[TMP69]] // CHECK3-NEXT: br i1 [[CMP48_I]], label [[OMP_INNER_FOR_BODY49_I:%.*]], label [[OMP_INNER_FOR_END82_I:%.*]] // CHECK3: omp.inner.for.body49.i: -// CHECK3-NEXT: [[TMP70:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47 -// CHECK3-NEXT: [[TMP71:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47 -// CHECK3-NEXT: [[TMP72:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47 +// CHECK3-NEXT: [[TMP70:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]] +// CHECK3-NEXT: [[TMP71:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]] +// CHECK3-NEXT: [[TMP72:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]] // CHECK3-NEXT: [[SUB50_I:%.*]] = sub i32 [[TMP71]], [[TMP72]] // CHECK3-NEXT: [[SUB51_I:%.*]] = sub i32 [[SUB50_I]], 1 // CHECK3-NEXT: [[CONV55_I:%.*]] = zext i32 [[SUB50_I]] to i64 // CHECK3-NEXT: [[DIV56_I:%.*]] = sdiv i64 [[TMP70]], [[CONV55_I]] // CHECK3-NEXT: [[CONV59_I:%.*]] = trunc i64 [[DIV56_I]] to i32 -// CHECK3-NEXT: store i32 [[CONV59_I]], ptr [[I14_I]], align 4, !noalias !47 -// CHECK3-NEXT: [[TMP73:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47 +// CHECK3-NEXT: store i32 [[CONV59_I]], ptr [[I14_I]], align 4, !noalias [[META47]] +// CHECK3-NEXT: [[TMP73:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]] // CHECK3-NEXT: [[CONV60_I:%.*]] = sext i32 [[TMP73]] to i64 -// CHECK3-NEXT: [[TMP74:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47 -// CHECK3-NEXT: [[TMP75:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47 -// CHECK3-NEXT: [[TMP76:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47 -// CHECK3-NEXT: [[TMP77:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47 +// CHECK3-NEXT: [[TMP74:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]] +// CHECK3-NEXT: [[TMP75:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]] +// CHECK3-NEXT: [[TMP76:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]] +// CHECK3-NEXT: [[TMP77:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]] // CHECK3-NEXT: [[SUB61_I:%.*]] = sub i32 [[TMP76]], [[TMP77]] // CHECK3-NEXT: [[SUB62_I:%.*]] = sub i32 [[SUB61_I]], 1 // CHECK3-NEXT: [[CONV66_I:%.*]] = zext i32 [[SUB61_I]] to i64 // CHECK3-NEXT: [[DIV67_I:%.*]] = sdiv i64 [[TMP75]], [[CONV66_I]] -// CHECK3-NEXT: [[TMP78:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias !47 -// CHECK3-NEXT: [[TMP79:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !47 +// CHECK3-NEXT: [[TMP78:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3_I]], align 4, !noalias [[META47]] +// CHECK3-NEXT: [[TMP79:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META47]] // CHECK3-NEXT: [[SUB68_I:%.*]] = sub i32 [[TMP78]], [[TMP79]] // CHECK3-NEXT: [[SUB69_I:%.*]] = sub i32 [[SUB68_I]], 1 // CHECK3-NEXT: [[CONV73_I:%.*]] = zext i32 [[SUB68_I]] to i64 @@ -2287,17 +2287,17 @@ struct S { // CHECK3-NEXT: [[SUB75_I:%.*]] = sub nsw i64 [[TMP74]], [[MUL74_I]] // CHECK3-NEXT: [[ADD77_I:%.*]] = add nsw i64 [[CONV60_I]], [[SUB75_I]] // CHECK3-NEXT: [[CONV78_I:%.*]] = trunc i64 [[ADD77_I]] to i32 -// CHECK3-NEXT: store i32 [[CONV78_I]], ptr [[J15_I]], align 4, !noalias !47 -// CHECK3-NEXT: [[TMP80:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias !47 +// CHECK3-NEXT: store i32 [[CONV78_I]], ptr [[J15_I]], align 4, !noalias [[META47]] +// CHECK3-NEXT: [[TMP80:%.*]] = load i64, ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]] // CHECK3-NEXT: [[ADD81_I:%.*]] = add nsw i64 [[TMP80]], 1 -// CHECK3-NEXT: store i64 [[ADD81_I]], ptr [[DOTOMP_IV_I]], align 8, !noalias !47 +// CHECK3-NEXT: store i64 [[ADD81_I]], ptr [[DOTOMP_IV_I]], align 8, !noalias [[META47]] // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND47_I]], !llvm.loop [[LOOP51:![0-9]+]] // CHECK3: omp.inner.for.end82.i: // CHECK3-NEXT: br label [[OMP_IF_END_I]] // CHECK3: omp_if.end.i: // CHECK3-NEXT: br label [[TASKLOOP_IF_END_I]] // CHECK3: taskloop.if.end.i: -// CHECK3-NEXT: [[TMP81:%.*]] = load i32, ptr [[DOTLITER__ADDR_I]], align 4, !noalias !47 +// CHECK3-NEXT: [[TMP81:%.*]] = load i32, ptr [[DOTLITER__ADDR_I]], align 4, !noalias [[META47]] // CHECK3-NEXT: [[TMP82:%.*]] = icmp ne i32 [[TMP81]], 0 // CHECK3-NEXT: br i1 [[TMP82]], label [[DOTOMP_LASTPRIVATE_THEN_I:%.*]], label [[DOTOMP_OUTLINED__5_EXIT:%.*]] // CHECK3: .omp.lastprivate.then.i: @@ -2356,12 +2356,12 @@ struct S { // CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[C_ADDR]], align 4 // CHECK3-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 -// CHECK3-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK3-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK3-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK3-NEXT: [[TOBOOL2:%.*]] = trunc i8 [[TMP1]] to i1 -// CHECK3-NEXT: [[FROMBOOL3:%.*]] = zext i1 [[TOBOOL2]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL3]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP1]] to i1 +// CHECK3-NEXT: [[STOREDV2:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK3-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @_ZN1SC2Ei.omp_outlined, ptr [[THIS1]], ptr [[C_ADDR]], i64 [[TMP2]]) // CHECK3-NEXT: ret void @@ -2399,7 +2399,7 @@ struct S { // CHECK3-NEXT: store ptr [[TMP1]], ptr [[TMP7]], align 8 // CHECK3-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP3]]) // CHECK3-NEXT: [[TMP8:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK3-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP8]] to i1 +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP8]] to i1 // CHECK3-NEXT: store ptr [[TMP]], ptr [[_TMP1]], align 8 // CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP1]], align 4 // CHECK3-NEXT: store i32 [[TMP9]], ptr [[DOTCAPTURE_EXPR_2]], align 4 @@ -2408,7 +2408,7 @@ struct S { // CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 // CHECK3-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 // CHECK3-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = select i1 [[TOBOOL]], i32 2, i32 0 +// CHECK3-NEXT: [[TMP11:%.*]] = select i1 [[LOADEDV]], i32 2, i32 0 // CHECK3-NEXT: [[TMP12:%.*]] = or i32 [[TMP11]], 1 // CHECK3-NEXT: [[TMP13:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP3]], i32 [[TMP12]], i64 80, i64 16, ptr @.omp_task_entry..8) // CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_5:%.*]], ptr [[TMP13]], i32 0, i32 0 @@ -2482,54 +2482,54 @@ struct S { // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META58:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META60:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META62:![0-9]+]]) -// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !64 -// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !64 -// CHECK3-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !64 -// CHECK3-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !64 -// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !64 -// CHECK3-NEXT: store i64 [[TMP9]], ptr [[DOTLB__ADDR_I]], align 8, !noalias !64 -// CHECK3-NEXT: store i64 [[TMP11]], ptr [[DOTUB__ADDR_I]], align 8, !noalias !64 -// CHECK3-NEXT: store i64 [[TMP13]], ptr [[DOTST__ADDR_I]], align 8, !noalias !64 -// CHECK3-NEXT: store i32 [[TMP15]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias !64 -// CHECK3-NEXT: store ptr [[TMP17]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias !64 -// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !64 -// CHECK3-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !64 +// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META64:![0-9]+]] +// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META64]] +// CHECK3-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META64]] +// CHECK3-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META64]] +// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META64]] +// CHECK3-NEXT: store i64 [[TMP9]], ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META64]] +// CHECK3-NEXT: store i64 [[TMP11]], ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META64]] +// CHECK3-NEXT: store i64 [[TMP13]], ptr [[DOTST__ADDR_I]], align 8, !noalias [[META64]] +// CHECK3-NEXT: store i32 [[TMP15]], ptr [[DOTLITER__ADDR_I]], align 4, !noalias [[META64]] +// CHECK3-NEXT: store ptr [[TMP17]], ptr [[DOTREDUCTIONS__ADDR_I]], align 8, !noalias [[META64]] +// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META64]] +// CHECK3-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META64]] // CHECK3-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP18]], align 8 -// CHECK3-NEXT: store ptr [[TMP_I]], ptr [[TMP1_I]], align 8, !noalias !64 +// CHECK3-NEXT: store ptr [[TMP_I]], ptr [[TMP1_I]], align 8, !noalias [[META64]] // CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_ANON_4:%.*]], ptr [[TMP18]], i32 0, i32 1 // CHECK3-NEXT: [[TMP21:%.*]] = load ptr, ptr [[TMP20]], align 8 // CHECK3-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 -// CHECK3-NEXT: store i32 [[TMP22]], ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias !64 -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias !64 +// CHECK3-NEXT: store i32 [[TMP22]], ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias [[META64]] +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias [[META64]] // CHECK3-NEXT: [[SUB3_I:%.*]] = sub nsw i32 [[TMP23]], 1 -// CHECK3-NEXT: store i32 [[SUB3_I]], ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias !64 -// CHECK3-NEXT: store ptr [[A_I]], ptr [[TMP4_I]], align 8, !noalias !64 -// CHECK3-NEXT: [[TMP24:%.*]] = load ptr, ptr [[TMP4_I]], align 8, !noalias !64 +// CHECK3-NEXT: store i32 [[SUB3_I]], ptr [[DOTCAPTURE_EXPR_2_I]], align 4, !noalias [[META64]] +// CHECK3-NEXT: store ptr [[A_I]], ptr [[TMP4_I]], align 8, !noalias [[META64]] +// CHECK3-NEXT: [[TMP24:%.*]] = load ptr, ptr [[TMP4_I]], align 8, !noalias [[META64]] // CHECK3-NEXT: store i32 0, ptr [[TMP24]], align 4 -// CHECK3-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias !64 +// CHECK3-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__I]], align 4, !noalias [[META64]] // CHECK3-NEXT: [[CMP_I:%.*]] = icmp slt i32 0, [[TMP25]] // CHECK3-NEXT: br i1 [[CMP_I]], label [[TASKLOOP_IF_THEN_I:%.*]], label [[DOTOMP_OUTLINED__7_EXIT:%.*]] // CHECK3: taskloop.if.then.i: -// CHECK3-NEXT: store ptr [[A5_I]], ptr [[TMP6_I]], align 8, !noalias !64 -// CHECK3-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias !64 +// CHECK3-NEXT: store ptr [[A5_I]], ptr [[TMP6_I]], align 8, !noalias [[META64]] +// CHECK3-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTLB__ADDR_I]], align 8, !noalias [[META64]] // CHECK3-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP26]] to i32 -// CHECK3-NEXT: store i32 [[CONV_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias !64 +// CHECK3-NEXT: store i32 [[CONV_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias [[META64]] // CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_ANON_4]], ptr [[TMP18]], i32 0, i32 1 // CHECK3-NEXT: [[TMP28:%.*]] = load ptr, ptr [[TMP27]], align 8 // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND_I:%.*]] // CHECK3: omp.inner.for.cond.i: -// CHECK3-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !64, !llvm.access.group [[ACC_GRP65:![0-9]+]] +// CHECK3-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META64]], !llvm.access.group [[ACC_GRP65:![0-9]+]] // CHECK3-NEXT: [[CONV7_I:%.*]] = sext i32 [[TMP29]] to i64 -// CHECK3-NEXT: [[TMP30:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias !64, !llvm.access.group [[ACC_GRP65]] +// CHECK3-NEXT: [[TMP30:%.*]] = load i64, ptr [[DOTUB__ADDR_I]], align 8, !noalias [[META64]], !llvm.access.group [[ACC_GRP65]] // CHECK3-NEXT: [[CMP8_I:%.*]] = icmp ule i64 [[CONV7_I]], [[TMP30]] // CHECK3-NEXT: br i1 [[CMP8_I]], label [[OMP_INNER_FOR_BODY_I:%.*]], label [[OMP_INNER_FOR_END_I:%.*]] // CHECK3: omp.inner.for.body.i: -// CHECK3-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !64, !llvm.access.group [[ACC_GRP65]] -// CHECK3-NEXT: [[TMP32:%.*]] = load ptr, ptr [[TMP6_I]], align 8, !noalias !64, !llvm.access.group [[ACC_GRP65]] +// CHECK3-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META64]], !llvm.access.group [[ACC_GRP65]] +// CHECK3-NEXT: [[TMP32:%.*]] = load ptr, ptr [[TMP6_I]], align 8, !noalias [[META64]], !llvm.access.group [[ACC_GRP65]] // CHECK3-NEXT: store i32 [[TMP31]], ptr [[TMP32]], align 4, !llvm.access.group [[ACC_GRP65]] -// CHECK3-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias !64, !llvm.access.group [[ACC_GRP65]] +// CHECK3-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_IV_I]], align 4, !noalias [[META64]], !llvm.access.group [[ACC_GRP65]] // CHECK3-NEXT: [[ADD9_I:%.*]] = add nsw i32 [[TMP33]], 1 -// CHECK3-NEXT: store i32 [[ADD9_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias !64, !llvm.access.group [[ACC_GRP65]] +// CHECK3-NEXT: store i32 [[ADD9_I]], ptr [[DOTOMP_IV_I]], align 4, !noalias [[META64]], !llvm.access.group [[ACC_GRP65]] // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND_I]], !llvm.loop [[LOOP66:![0-9]+]] // CHECK3: omp.inner.for.end.i: // CHECK3-NEXT: br label [[DOTOMP_OUTLINED__7_EXIT]] @@ -2641,8 +2641,8 @@ struct S { // CHECK5-NEXT: store i32 10, ptr [[I9]], align 4 // CHECK5-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK5-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP12]], 0 -// CHECK5-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK5-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_21]], align 1 +// CHECK5-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK5-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_21]], align 1 // CHECK5-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK5-NEXT: store i32 [[TMP13]], ptr [[DOTCAPTURE_EXPR_22]], align 4 // CHECK5-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -2815,8 +2815,8 @@ struct S { // CHECK5-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK5-NEXT: [[TMP0:%.*]] = load i32, ptr [[C_ADDR]], align 4 // CHECK5-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 -// CHECK5-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK5-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK5-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK5-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK5-NEXT: store ptr [[TMP]], ptr [[_TMP2]], align 8 // CHECK5-NEXT: [[TMP1:%.*]] = load i32, ptr [[C_ADDR]], align 4 // CHECK5-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_3]], align 4 @@ -2980,8 +2980,8 @@ struct S { // CHECK6-NEXT: store i32 10, ptr [[I9]], align 4 // CHECK6-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK6-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP12]], 0 -// CHECK6-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK6-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_21]], align 1 +// CHECK6-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK6-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_21]], align 1 // CHECK6-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK6-NEXT: store i32 [[TMP13]], ptr [[DOTCAPTURE_EXPR_22]], align 4 // CHECK6-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -3154,8 +3154,8 @@ struct S { // CHECK6-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK6-NEXT: [[TMP0:%.*]] = load i32, ptr [[C_ADDR]], align 4 // CHECK6-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 -// CHECK6-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK6-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK6-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK6-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK6-NEXT: store ptr [[TMP]], ptr [[_TMP2]], align 8 // CHECK6-NEXT: [[TMP1:%.*]] = load i32, ptr [[C_ADDR]], align 4 // CHECK6-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_3]], align 4 @@ -3319,8 +3319,8 @@ struct S { // CHECK7-NEXT: store i32 10, ptr [[I9]], align 4 // CHECK7-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK7-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP12]], 0 -// CHECK7-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK7-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_21]], align 1 +// CHECK7-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK7-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_21]], align 1 // CHECK7-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK7-NEXT: store i32 [[TMP13]], ptr [[DOTCAPTURE_EXPR_22]], align 4 // CHECK7-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -3372,143 +3372,143 @@ struct S { // CHECK7-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 // CHECK7-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP30]], i64 8) ] // CHECK7-NEXT: [[TMP31:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_21]], align 1 -// CHECK7-NEXT: [[TOBOOL48:%.*]] = trunc i8 [[TMP31]] to i1 -// CHECK7-NEXT: br i1 [[TOBOOL48]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK7-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP31]] to i1 +// CHECK7-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK7: omp_if.then: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND49:%.*]] -// CHECK7: omp.inner.for.cond49: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND48:%.*]] +// CHECK7: omp.inner.for.cond48: // CHECK7-NEXT: [[TMP32:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8, !llvm.access.group [[ACC_GRP9:![0-9]+]] // CHECK7-NEXT: [[TMP33:%.*]] = load i64, ptr [[DOTOMP_UB41]], align 8, !llvm.access.group [[ACC_GRP9]] -// CHECK7-NEXT: [[CMP50:%.*]] = icmp ule i64 [[TMP32]], [[TMP33]] -// CHECK7-NEXT: br i1 [[CMP50]], label [[OMP_INNER_FOR_BODY51:%.*]], label [[OMP_INNER_FOR_END84:%.*]] -// CHECK7: omp.inner.for.body51: +// CHECK7-NEXT: [[CMP49:%.*]] = icmp ule i64 [[TMP32]], [[TMP33]] +// CHECK7-NEXT: br i1 [[CMP49]], label [[OMP_INNER_FOR_BODY50:%.*]], label [[OMP_INNER_FOR_END83:%.*]] +// CHECK7: omp.inner.for.body50: // CHECK7-NEXT: [[TMP34:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK7-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_27]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK7-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4, !llvm.access.group [[ACC_GRP9]] -// CHECK7-NEXT: [[SUB52:%.*]] = sub i32 [[TMP35]], [[TMP36]] -// CHECK7-NEXT: [[SUB53:%.*]] = sub i32 [[SUB52]], 1 -// CHECK7-NEXT: [[ADD54:%.*]] = add i32 [[SUB53]], 1 -// CHECK7-NEXT: [[DIV55:%.*]] = udiv i32 [[ADD54]], 1 -// CHECK7-NEXT: [[MUL56:%.*]] = mul i32 1, [[DIV55]] -// CHECK7-NEXT: [[CONV57:%.*]] = zext i32 [[MUL56]] to i64 -// CHECK7-NEXT: [[DIV58:%.*]] = sdiv i64 [[TMP34]], [[CONV57]] -// CHECK7-NEXT: [[MUL59:%.*]] = mul nsw i64 [[DIV58]], 1 -// CHECK7-NEXT: [[ADD60:%.*]] = add nsw i64 0, [[MUL59]] -// CHECK7-NEXT: [[CONV61:%.*]] = trunc i64 [[ADD60]] to i32 -// CHECK7-NEXT: store i32 [[CONV61]], ptr [[I46]], align 4, !llvm.access.group [[ACC_GRP9]] +// CHECK7-NEXT: [[SUB51:%.*]] = sub i32 [[TMP35]], [[TMP36]] +// CHECK7-NEXT: [[SUB52:%.*]] = sub i32 [[SUB51]], 1 +// CHECK7-NEXT: [[ADD53:%.*]] = add i32 [[SUB52]], 1 +// CHECK7-NEXT: [[DIV54:%.*]] = udiv i32 [[ADD53]], 1 +// CHECK7-NEXT: [[MUL55:%.*]] = mul i32 1, [[DIV54]] +// CHECK7-NEXT: [[CONV56:%.*]] = zext i32 [[MUL55]] to i64 +// CHECK7-NEXT: [[DIV57:%.*]] = sdiv i64 [[TMP34]], [[CONV56]] +// CHECK7-NEXT: [[MUL58:%.*]] = mul nsw i64 [[DIV57]], 1 +// CHECK7-NEXT: [[ADD59:%.*]] = add nsw i64 0, [[MUL58]] +// CHECK7-NEXT: [[CONV60:%.*]] = trunc i64 [[ADD59]] to i32 +// CHECK7-NEXT: store i32 [[CONV60]], ptr [[I46]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK7-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4, !llvm.access.group [[ACC_GRP9]] -// CHECK7-NEXT: [[CONV62:%.*]] = sext i32 [[TMP37]] to i64 +// CHECK7-NEXT: [[CONV61:%.*]] = sext i32 [[TMP37]] to i64 // CHECK7-NEXT: [[TMP38:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK7-NEXT: [[TMP39:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK7-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_27]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK7-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4, !llvm.access.group [[ACC_GRP9]] -// CHECK7-NEXT: [[SUB63:%.*]] = sub i32 [[TMP40]], [[TMP41]] -// CHECK7-NEXT: [[SUB64:%.*]] = sub i32 [[SUB63]], 1 -// CHECK7-NEXT: [[ADD65:%.*]] = add i32 [[SUB64]], 1 -// CHECK7-NEXT: [[DIV66:%.*]] = udiv i32 [[ADD65]], 1 -// CHECK7-NEXT: [[MUL67:%.*]] = mul i32 1, [[DIV66]] -// CHECK7-NEXT: [[CONV68:%.*]] = zext i32 [[MUL67]] to i64 -// CHECK7-NEXT: [[DIV69:%.*]] = sdiv i64 [[TMP39]], [[CONV68]] +// CHECK7-NEXT: [[SUB62:%.*]] = sub i32 [[TMP40]], [[TMP41]] +// CHECK7-NEXT: [[SUB63:%.*]] = sub i32 [[SUB62]], 1 +// CHECK7-NEXT: [[ADD64:%.*]] = add i32 [[SUB63]], 1 +// CHECK7-NEXT: [[DIV65:%.*]] = udiv i32 [[ADD64]], 1 +// CHECK7-NEXT: [[MUL66:%.*]] = mul i32 1, [[DIV65]] +// CHECK7-NEXT: [[CONV67:%.*]] = zext i32 [[MUL66]] to i64 +// CHECK7-NEXT: [[DIV68:%.*]] = sdiv i64 [[TMP39]], [[CONV67]] // CHECK7-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_27]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK7-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4, !llvm.access.group [[ACC_GRP9]] -// CHECK7-NEXT: [[SUB70:%.*]] = sub i32 [[TMP42]], [[TMP43]] -// CHECK7-NEXT: [[SUB71:%.*]] = sub i32 [[SUB70]], 1 -// CHECK7-NEXT: [[ADD72:%.*]] = add i32 [[SUB71]], 1 -// CHECK7-NEXT: [[DIV73:%.*]] = udiv i32 [[ADD72]], 1 -// CHECK7-NEXT: [[MUL74:%.*]] = mul i32 1, [[DIV73]] -// CHECK7-NEXT: [[CONV75:%.*]] = zext i32 [[MUL74]] to i64 -// CHECK7-NEXT: [[MUL76:%.*]] = mul nsw i64 [[DIV69]], [[CONV75]] -// CHECK7-NEXT: [[SUB77:%.*]] = sub nsw i64 [[TMP38]], [[MUL76]] -// CHECK7-NEXT: [[MUL78:%.*]] = mul nsw i64 [[SUB77]], 1 -// CHECK7-NEXT: [[ADD79:%.*]] = add nsw i64 [[CONV62]], [[MUL78]] -// CHECK7-NEXT: [[CONV80:%.*]] = trunc i64 [[ADD79]] to i32 -// CHECK7-NEXT: store i32 [[CONV80]], ptr [[J47]], align 4, !llvm.access.group [[ACC_GRP9]] -// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE81:%.*]] -// CHECK7: omp.body.continue81: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC82:%.*]] -// CHECK7: omp.inner.for.inc82: +// CHECK7-NEXT: [[SUB69:%.*]] = sub i32 [[TMP42]], [[TMP43]] +// CHECK7-NEXT: [[SUB70:%.*]] = sub i32 [[SUB69]], 1 +// CHECK7-NEXT: [[ADD71:%.*]] = add i32 [[SUB70]], 1 +// CHECK7-NEXT: [[DIV72:%.*]] = udiv i32 [[ADD71]], 1 +// CHECK7-NEXT: [[MUL73:%.*]] = mul i32 1, [[DIV72]] +// CHECK7-NEXT: [[CONV74:%.*]] = zext i32 [[MUL73]] to i64 +// CHECK7-NEXT: [[MUL75:%.*]] = mul nsw i64 [[DIV68]], [[CONV74]] +// CHECK7-NEXT: [[SUB76:%.*]] = sub nsw i64 [[TMP38]], [[MUL75]] +// CHECK7-NEXT: [[MUL77:%.*]] = mul nsw i64 [[SUB76]], 1 +// CHECK7-NEXT: [[ADD78:%.*]] = add nsw i64 [[CONV61]], [[MUL77]] +// CHECK7-NEXT: [[CONV79:%.*]] = trunc i64 [[ADD78]] to i32 +// CHECK7-NEXT: store i32 [[CONV79]], ptr [[J47]], align 4, !llvm.access.group [[ACC_GRP9]] +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE80:%.*]] +// CHECK7: omp.body.continue80: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC81:%.*]] +// CHECK7: omp.inner.for.inc81: // CHECK7-NEXT: [[TMP44:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8, !llvm.access.group [[ACC_GRP9]] -// CHECK7-NEXT: [[ADD83:%.*]] = add nsw i64 [[TMP44]], 1 -// CHECK7-NEXT: store i64 [[ADD83]], ptr [[DOTOMP_IV45]], align 8, !llvm.access.group [[ACC_GRP9]] -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND49]], !llvm.loop [[LOOP10:![0-9]+]] -// CHECK7: omp.inner.for.end84: +// CHECK7-NEXT: [[ADD82:%.*]] = add nsw i64 [[TMP44]], 1 +// CHECK7-NEXT: store i64 [[ADD82]], ptr [[DOTOMP_IV45]], align 8, !llvm.access.group [[ACC_GRP9]] +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND48]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK7: omp.inner.for.end83: // CHECK7-NEXT: br label [[OMP_IF_END:%.*]] // CHECK7: omp_if.else: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND85:%.*]] -// CHECK7: omp.inner.for.cond85: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND84:%.*]] +// CHECK7: omp.inner.for.cond84: // CHECK7-NEXT: [[TMP45:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8 // CHECK7-NEXT: [[TMP46:%.*]] = load i64, ptr [[DOTOMP_UB41]], align 8 -// CHECK7-NEXT: [[CMP86:%.*]] = icmp ule i64 [[TMP45]], [[TMP46]] -// CHECK7-NEXT: br i1 [[CMP86]], label [[OMP_INNER_FOR_BODY87:%.*]], label [[OMP_INNER_FOR_END120:%.*]] -// CHECK7: omp.inner.for.body87: +// CHECK7-NEXT: [[CMP85:%.*]] = icmp ule i64 [[TMP45]], [[TMP46]] +// CHECK7-NEXT: br i1 [[CMP85]], label [[OMP_INNER_FOR_BODY86:%.*]], label [[OMP_INNER_FOR_END119:%.*]] +// CHECK7: omp.inner.for.body86: // CHECK7-NEXT: [[TMP47:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8 // CHECK7-NEXT: [[TMP48:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_27]], align 4 // CHECK7-NEXT: [[TMP49:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4 -// CHECK7-NEXT: [[SUB88:%.*]] = sub i32 [[TMP48]], [[TMP49]] -// CHECK7-NEXT: [[SUB89:%.*]] = sub i32 [[SUB88]], 1 -// CHECK7-NEXT: [[ADD90:%.*]] = add i32 [[SUB89]], 1 -// CHECK7-NEXT: [[DIV91:%.*]] = udiv i32 [[ADD90]], 1 -// CHECK7-NEXT: [[MUL92:%.*]] = mul i32 1, [[DIV91]] -// CHECK7-NEXT: [[CONV93:%.*]] = zext i32 [[MUL92]] to i64 -// CHECK7-NEXT: [[DIV94:%.*]] = sdiv i64 [[TMP47]], [[CONV93]] -// CHECK7-NEXT: [[MUL95:%.*]] = mul nsw i64 [[DIV94]], 1 -// CHECK7-NEXT: [[ADD96:%.*]] = add nsw i64 0, [[MUL95]] -// CHECK7-NEXT: [[CONV97:%.*]] = trunc i64 [[ADD96]] to i32 -// CHECK7-NEXT: store i32 [[CONV97]], ptr [[I46]], align 4 +// CHECK7-NEXT: [[SUB87:%.*]] = sub i32 [[TMP48]], [[TMP49]] +// CHECK7-NEXT: [[SUB88:%.*]] = sub i32 [[SUB87]], 1 +// CHECK7-NEXT: [[ADD89:%.*]] = add i32 [[SUB88]], 1 +// CHECK7-NEXT: [[DIV90:%.*]] = udiv i32 [[ADD89]], 1 +// CHECK7-NEXT: [[MUL91:%.*]] = mul i32 1, [[DIV90]] +// CHECK7-NEXT: [[CONV92:%.*]] = zext i32 [[MUL91]] to i64 +// CHECK7-NEXT: [[DIV93:%.*]] = sdiv i64 [[TMP47]], [[CONV92]] +// CHECK7-NEXT: [[MUL94:%.*]] = mul nsw i64 [[DIV93]], 1 +// CHECK7-NEXT: [[ADD95:%.*]] = add nsw i64 0, [[MUL94]] +// CHECK7-NEXT: [[CONV96:%.*]] = trunc i64 [[ADD95]] to i32 +// CHECK7-NEXT: store i32 [[CONV96]], ptr [[I46]], align 4 // CHECK7-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4 -// CHECK7-NEXT: [[CONV98:%.*]] = sext i32 [[TMP50]] to i64 +// CHECK7-NEXT: [[CONV97:%.*]] = sext i32 [[TMP50]] to i64 // CHECK7-NEXT: [[TMP51:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8 // CHECK7-NEXT: [[TMP52:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8 // CHECK7-NEXT: [[TMP53:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_27]], align 4 // CHECK7-NEXT: [[TMP54:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4 -// CHECK7-NEXT: [[SUB99:%.*]] = sub i32 [[TMP53]], [[TMP54]] -// CHECK7-NEXT: [[SUB100:%.*]] = sub i32 [[SUB99]], 1 -// CHECK7-NEXT: [[ADD101:%.*]] = add i32 [[SUB100]], 1 -// CHECK7-NEXT: [[DIV102:%.*]] = udiv i32 [[ADD101]], 1 -// CHECK7-NEXT: [[MUL103:%.*]] = mul i32 1, [[DIV102]] -// CHECK7-NEXT: [[CONV104:%.*]] = zext i32 [[MUL103]] to i64 -// CHECK7-NEXT: [[DIV105:%.*]] = sdiv i64 [[TMP52]], [[CONV104]] +// CHECK7-NEXT: [[SUB98:%.*]] = sub i32 [[TMP53]], [[TMP54]] +// CHECK7-NEXT: [[SUB99:%.*]] = sub i32 [[SUB98]], 1 +// CHECK7-NEXT: [[ADD100:%.*]] = add i32 [[SUB99]], 1 +// CHECK7-NEXT: [[DIV101:%.*]] = udiv i32 [[ADD100]], 1 +// CHECK7-NEXT: [[MUL102:%.*]] = mul i32 1, [[DIV101]] +// CHECK7-NEXT: [[CONV103:%.*]] = zext i32 [[MUL102]] to i64 +// CHECK7-NEXT: [[DIV104:%.*]] = sdiv i64 [[TMP52]], [[CONV103]] // CHECK7-NEXT: [[TMP55:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_27]], align 4 // CHECK7-NEXT: [[TMP56:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4 -// CHECK7-NEXT: [[SUB106:%.*]] = sub i32 [[TMP55]], [[TMP56]] -// CHECK7-NEXT: [[SUB107:%.*]] = sub i32 [[SUB106]], 1 -// CHECK7-NEXT: [[ADD108:%.*]] = add i32 [[SUB107]], 1 -// CHECK7-NEXT: [[DIV109:%.*]] = udiv i32 [[ADD108]], 1 -// CHECK7-NEXT: [[MUL110:%.*]] = mul i32 1, [[DIV109]] -// CHECK7-NEXT: [[CONV111:%.*]] = zext i32 [[MUL110]] to i64 -// CHECK7-NEXT: [[MUL112:%.*]] = mul nsw i64 [[DIV105]], [[CONV111]] -// CHECK7-NEXT: [[SUB113:%.*]] = sub nsw i64 [[TMP51]], [[MUL112]] -// CHECK7-NEXT: [[MUL114:%.*]] = mul nsw i64 [[SUB113]], 1 -// CHECK7-NEXT: [[ADD115:%.*]] = add nsw i64 [[CONV98]], [[MUL114]] -// CHECK7-NEXT: [[CONV116:%.*]] = trunc i64 [[ADD115]] to i32 -// CHECK7-NEXT: store i32 [[CONV116]], ptr [[J47]], align 4 -// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE117:%.*]] -// CHECK7: omp.body.continue117: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC118:%.*]] -// CHECK7: omp.inner.for.inc118: +// CHECK7-NEXT: [[SUB105:%.*]] = sub i32 [[TMP55]], [[TMP56]] +// CHECK7-NEXT: [[SUB106:%.*]] = sub i32 [[SUB105]], 1 +// CHECK7-NEXT: [[ADD107:%.*]] = add i32 [[SUB106]], 1 +// CHECK7-NEXT: [[DIV108:%.*]] = udiv i32 [[ADD107]], 1 +// CHECK7-NEXT: [[MUL109:%.*]] = mul i32 1, [[DIV108]] +// CHECK7-NEXT: [[CONV110:%.*]] = zext i32 [[MUL109]] to i64 +// CHECK7-NEXT: [[MUL111:%.*]] = mul nsw i64 [[DIV104]], [[CONV110]] +// CHECK7-NEXT: [[SUB112:%.*]] = sub nsw i64 [[TMP51]], [[MUL111]] +// CHECK7-NEXT: [[MUL113:%.*]] = mul nsw i64 [[SUB112]], 1 +// CHECK7-NEXT: [[ADD114:%.*]] = add nsw i64 [[CONV97]], [[MUL113]] +// CHECK7-NEXT: [[CONV115:%.*]] = trunc i64 [[ADD114]] to i32 +// CHECK7-NEXT: store i32 [[CONV115]], ptr [[J47]], align 4 +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE116:%.*]] +// CHECK7: omp.body.continue116: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC117:%.*]] +// CHECK7: omp.inner.for.inc117: // CHECK7-NEXT: [[TMP57:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8 -// CHECK7-NEXT: [[ADD119:%.*]] = add nsw i64 [[TMP57]], 1 -// CHECK7-NEXT: store i64 [[ADD119]], ptr [[DOTOMP_IV45]], align 8 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND85]], !llvm.loop [[LOOP12:![0-9]+]] -// CHECK7: omp.inner.for.end120: +// CHECK7-NEXT: [[ADD118:%.*]] = add nsw i64 [[TMP57]], 1 +// CHECK7-NEXT: store i64 [[ADD118]], ptr [[DOTOMP_IV45]], align 8 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND84]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK7: omp.inner.for.end119: // CHECK7-NEXT: br label [[OMP_IF_END]] // CHECK7: omp_if.end: // CHECK7-NEXT: [[TMP58:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_25]], align 4 -// CHECK7-NEXT: [[SUB121:%.*]] = sub nsw i32 [[TMP58]], 0 -// CHECK7-NEXT: [[DIV122:%.*]] = sdiv i32 [[SUB121]], 1 -// CHECK7-NEXT: [[MUL123:%.*]] = mul nsw i32 [[DIV122]], 1 -// CHECK7-NEXT: [[ADD124:%.*]] = add nsw i32 0, [[MUL123]] -// CHECK7-NEXT: store i32 [[ADD124]], ptr [[I20]], align 4 +// CHECK7-NEXT: [[SUB120:%.*]] = sub nsw i32 [[TMP58]], 0 +// CHECK7-NEXT: [[DIV121:%.*]] = sdiv i32 [[SUB120]], 1 +// CHECK7-NEXT: [[MUL122:%.*]] = mul nsw i32 [[DIV121]], 1 +// CHECK7-NEXT: [[ADD123:%.*]] = add nsw i32 0, [[MUL122]] +// CHECK7-NEXT: store i32 [[ADD123]], ptr [[I20]], align 4 // CHECK7-NEXT: [[TMP59:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4 // CHECK7-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_27]], align 4 // CHECK7-NEXT: [[TMP61:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4 -// CHECK7-NEXT: [[SUB125:%.*]] = sub i32 [[TMP60]], [[TMP61]] -// CHECK7-NEXT: [[SUB126:%.*]] = sub i32 [[SUB125]], 1 -// CHECK7-NEXT: [[ADD127:%.*]] = add i32 [[SUB126]], 1 -// CHECK7-NEXT: [[DIV128:%.*]] = udiv i32 [[ADD127]], 1 -// CHECK7-NEXT: [[MUL129:%.*]] = mul i32 [[DIV128]], 1 -// CHECK7-NEXT: [[ADD130:%.*]] = add i32 [[TMP59]], [[MUL129]] -// CHECK7-NEXT: store i32 [[ADD130]], ptr [[J47]], align 4 +// CHECK7-NEXT: [[SUB124:%.*]] = sub i32 [[TMP60]], [[TMP61]] +// CHECK7-NEXT: [[SUB125:%.*]] = sub i32 [[SUB124]], 1 +// CHECK7-NEXT: [[ADD126:%.*]] = add i32 [[SUB125]], 1 +// CHECK7-NEXT: [[DIV127:%.*]] = udiv i32 [[ADD126]], 1 +// CHECK7-NEXT: [[MUL128:%.*]] = mul i32 [[DIV127]], 1 +// CHECK7-NEXT: [[ADD129:%.*]] = add i32 [[TMP59]], [[MUL128]] +// CHECK7-NEXT: store i32 [[ADD129]], ptr [[J47]], align 4 // CHECK7-NEXT: br label [[SIMD_IF_END]] // CHECK7: simd.if.end: // CHECK7-NEXT: [[TMP62:%.*]] = load i32, ptr [[RETVAL]], align 4 @@ -3558,8 +3558,8 @@ struct S { // CHECK7-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK7-NEXT: [[TMP0:%.*]] = load i32, ptr [[C_ADDR]], align 4 // CHECK7-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 -// CHECK7-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK7-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK7-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK7-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK7-NEXT: store ptr [[TMP]], ptr [[_TMP2]], align 8 // CHECK7-NEXT: [[TMP1:%.*]] = load i32, ptr [[C_ADDR]], align 4 // CHECK7-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_3]], align 4 @@ -3723,8 +3723,8 @@ struct S { // CHECK8-NEXT: store i32 10, ptr [[I9]], align 4 // CHECK8-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK8-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP12]], 0 -// CHECK8-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK8-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_21]], align 1 +// CHECK8-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK8-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_21]], align 1 // CHECK8-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK8-NEXT: store i32 [[TMP13]], ptr [[DOTCAPTURE_EXPR_22]], align 4 // CHECK8-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -3776,143 +3776,143 @@ struct S { // CHECK8-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 // CHECK8-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP30]], i64 8) ] // CHECK8-NEXT: [[TMP31:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_21]], align 1 -// CHECK8-NEXT: [[TOBOOL48:%.*]] = trunc i8 [[TMP31]] to i1 -// CHECK8-NEXT: br i1 [[TOBOOL48]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK8-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP31]] to i1 +// CHECK8-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK8: omp_if.then: -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND49:%.*]] -// CHECK8: omp.inner.for.cond49: +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND48:%.*]] +// CHECK8: omp.inner.for.cond48: // CHECK8-NEXT: [[TMP32:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8, !llvm.access.group [[ACC_GRP9:![0-9]+]] // CHECK8-NEXT: [[TMP33:%.*]] = load i64, ptr [[DOTOMP_UB41]], align 8, !llvm.access.group [[ACC_GRP9]] -// CHECK8-NEXT: [[CMP50:%.*]] = icmp ule i64 [[TMP32]], [[TMP33]] -// CHECK8-NEXT: br i1 [[CMP50]], label [[OMP_INNER_FOR_BODY51:%.*]], label [[OMP_INNER_FOR_END84:%.*]] -// CHECK8: omp.inner.for.body51: +// CHECK8-NEXT: [[CMP49:%.*]] = icmp ule i64 [[TMP32]], [[TMP33]] +// CHECK8-NEXT: br i1 [[CMP49]], label [[OMP_INNER_FOR_BODY50:%.*]], label [[OMP_INNER_FOR_END83:%.*]] +// CHECK8: omp.inner.for.body50: // CHECK8-NEXT: [[TMP34:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK8-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_27]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK8-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4, !llvm.access.group [[ACC_GRP9]] -// CHECK8-NEXT: [[SUB52:%.*]] = sub i32 [[TMP35]], [[TMP36]] -// CHECK8-NEXT: [[SUB53:%.*]] = sub i32 [[SUB52]], 1 -// CHECK8-NEXT: [[ADD54:%.*]] = add i32 [[SUB53]], 1 -// CHECK8-NEXT: [[DIV55:%.*]] = udiv i32 [[ADD54]], 1 -// CHECK8-NEXT: [[MUL56:%.*]] = mul i32 1, [[DIV55]] -// CHECK8-NEXT: [[CONV57:%.*]] = zext i32 [[MUL56]] to i64 -// CHECK8-NEXT: [[DIV58:%.*]] = sdiv i64 [[TMP34]], [[CONV57]] -// CHECK8-NEXT: [[MUL59:%.*]] = mul nsw i64 [[DIV58]], 1 -// CHECK8-NEXT: [[ADD60:%.*]] = add nsw i64 0, [[MUL59]] -// CHECK8-NEXT: [[CONV61:%.*]] = trunc i64 [[ADD60]] to i32 -// CHECK8-NEXT: store i32 [[CONV61]], ptr [[I46]], align 4, !llvm.access.group [[ACC_GRP9]] +// CHECK8-NEXT: [[SUB51:%.*]] = sub i32 [[TMP35]], [[TMP36]] +// CHECK8-NEXT: [[SUB52:%.*]] = sub i32 [[SUB51]], 1 +// CHECK8-NEXT: [[ADD53:%.*]] = add i32 [[SUB52]], 1 +// CHECK8-NEXT: [[DIV54:%.*]] = udiv i32 [[ADD53]], 1 +// CHECK8-NEXT: [[MUL55:%.*]] = mul i32 1, [[DIV54]] +// CHECK8-NEXT: [[CONV56:%.*]] = zext i32 [[MUL55]] to i64 +// CHECK8-NEXT: [[DIV57:%.*]] = sdiv i64 [[TMP34]], [[CONV56]] +// CHECK8-NEXT: [[MUL58:%.*]] = mul nsw i64 [[DIV57]], 1 +// CHECK8-NEXT: [[ADD59:%.*]] = add nsw i64 0, [[MUL58]] +// CHECK8-NEXT: [[CONV60:%.*]] = trunc i64 [[ADD59]] to i32 +// CHECK8-NEXT: store i32 [[CONV60]], ptr [[I46]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK8-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4, !llvm.access.group [[ACC_GRP9]] -// CHECK8-NEXT: [[CONV62:%.*]] = sext i32 [[TMP37]] to i64 +// CHECK8-NEXT: [[CONV61:%.*]] = sext i32 [[TMP37]] to i64 // CHECK8-NEXT: [[TMP38:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK8-NEXT: [[TMP39:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK8-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_27]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK8-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4, !llvm.access.group [[ACC_GRP9]] -// CHECK8-NEXT: [[SUB63:%.*]] = sub i32 [[TMP40]], [[TMP41]] -// CHECK8-NEXT: [[SUB64:%.*]] = sub i32 [[SUB63]], 1 -// CHECK8-NEXT: [[ADD65:%.*]] = add i32 [[SUB64]], 1 -// CHECK8-NEXT: [[DIV66:%.*]] = udiv i32 [[ADD65]], 1 -// CHECK8-NEXT: [[MUL67:%.*]] = mul i32 1, [[DIV66]] -// CHECK8-NEXT: [[CONV68:%.*]] = zext i32 [[MUL67]] to i64 -// CHECK8-NEXT: [[DIV69:%.*]] = sdiv i64 [[TMP39]], [[CONV68]] +// CHECK8-NEXT: [[SUB62:%.*]] = sub i32 [[TMP40]], [[TMP41]] +// CHECK8-NEXT: [[SUB63:%.*]] = sub i32 [[SUB62]], 1 +// CHECK8-NEXT: [[ADD64:%.*]] = add i32 [[SUB63]], 1 +// CHECK8-NEXT: [[DIV65:%.*]] = udiv i32 [[ADD64]], 1 +// CHECK8-NEXT: [[MUL66:%.*]] = mul i32 1, [[DIV65]] +// CHECK8-NEXT: [[CONV67:%.*]] = zext i32 [[MUL66]] to i64 +// CHECK8-NEXT: [[DIV68:%.*]] = sdiv i64 [[TMP39]], [[CONV67]] // CHECK8-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_27]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK8-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4, !llvm.access.group [[ACC_GRP9]] -// CHECK8-NEXT: [[SUB70:%.*]] = sub i32 [[TMP42]], [[TMP43]] -// CHECK8-NEXT: [[SUB71:%.*]] = sub i32 [[SUB70]], 1 -// CHECK8-NEXT: [[ADD72:%.*]] = add i32 [[SUB71]], 1 -// CHECK8-NEXT: [[DIV73:%.*]] = udiv i32 [[ADD72]], 1 -// CHECK8-NEXT: [[MUL74:%.*]] = mul i32 1, [[DIV73]] -// CHECK8-NEXT: [[CONV75:%.*]] = zext i32 [[MUL74]] to i64 -// CHECK8-NEXT: [[MUL76:%.*]] = mul nsw i64 [[DIV69]], [[CONV75]] -// CHECK8-NEXT: [[SUB77:%.*]] = sub nsw i64 [[TMP38]], [[MUL76]] -// CHECK8-NEXT: [[MUL78:%.*]] = mul nsw i64 [[SUB77]], 1 -// CHECK8-NEXT: [[ADD79:%.*]] = add nsw i64 [[CONV62]], [[MUL78]] -// CHECK8-NEXT: [[CONV80:%.*]] = trunc i64 [[ADD79]] to i32 -// CHECK8-NEXT: store i32 [[CONV80]], ptr [[J47]], align 4, !llvm.access.group [[ACC_GRP9]] -// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE81:%.*]] -// CHECK8: omp.body.continue81: -// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC82:%.*]] -// CHECK8: omp.inner.for.inc82: +// CHECK8-NEXT: [[SUB69:%.*]] = sub i32 [[TMP42]], [[TMP43]] +// CHECK8-NEXT: [[SUB70:%.*]] = sub i32 [[SUB69]], 1 +// CHECK8-NEXT: [[ADD71:%.*]] = add i32 [[SUB70]], 1 +// CHECK8-NEXT: [[DIV72:%.*]] = udiv i32 [[ADD71]], 1 +// CHECK8-NEXT: [[MUL73:%.*]] = mul i32 1, [[DIV72]] +// CHECK8-NEXT: [[CONV74:%.*]] = zext i32 [[MUL73]] to i64 +// CHECK8-NEXT: [[MUL75:%.*]] = mul nsw i64 [[DIV68]], [[CONV74]] +// CHECK8-NEXT: [[SUB76:%.*]] = sub nsw i64 [[TMP38]], [[MUL75]] +// CHECK8-NEXT: [[MUL77:%.*]] = mul nsw i64 [[SUB76]], 1 +// CHECK8-NEXT: [[ADD78:%.*]] = add nsw i64 [[CONV61]], [[MUL77]] +// CHECK8-NEXT: [[CONV79:%.*]] = trunc i64 [[ADD78]] to i32 +// CHECK8-NEXT: store i32 [[CONV79]], ptr [[J47]], align 4, !llvm.access.group [[ACC_GRP9]] +// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE80:%.*]] +// CHECK8: omp.body.continue80: +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC81:%.*]] +// CHECK8: omp.inner.for.inc81: // CHECK8-NEXT: [[TMP44:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8, !llvm.access.group [[ACC_GRP9]] -// CHECK8-NEXT: [[ADD83:%.*]] = add nsw i64 [[TMP44]], 1 -// CHECK8-NEXT: store i64 [[ADD83]], ptr [[DOTOMP_IV45]], align 8, !llvm.access.group [[ACC_GRP9]] -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND49]], !llvm.loop [[LOOP10:![0-9]+]] -// CHECK8: omp.inner.for.end84: +// CHECK8-NEXT: [[ADD82:%.*]] = add nsw i64 [[TMP44]], 1 +// CHECK8-NEXT: store i64 [[ADD82]], ptr [[DOTOMP_IV45]], align 8, !llvm.access.group [[ACC_GRP9]] +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND48]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK8: omp.inner.for.end83: // CHECK8-NEXT: br label [[OMP_IF_END:%.*]] // CHECK8: omp_if.else: -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND85:%.*]] -// CHECK8: omp.inner.for.cond85: +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND84:%.*]] +// CHECK8: omp.inner.for.cond84: // CHECK8-NEXT: [[TMP45:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8 // CHECK8-NEXT: [[TMP46:%.*]] = load i64, ptr [[DOTOMP_UB41]], align 8 -// CHECK8-NEXT: [[CMP86:%.*]] = icmp ule i64 [[TMP45]], [[TMP46]] -// CHECK8-NEXT: br i1 [[CMP86]], label [[OMP_INNER_FOR_BODY87:%.*]], label [[OMP_INNER_FOR_END120:%.*]] -// CHECK8: omp.inner.for.body87: +// CHECK8-NEXT: [[CMP85:%.*]] = icmp ule i64 [[TMP45]], [[TMP46]] +// CHECK8-NEXT: br i1 [[CMP85]], label [[OMP_INNER_FOR_BODY86:%.*]], label [[OMP_INNER_FOR_END119:%.*]] +// CHECK8: omp.inner.for.body86: // CHECK8-NEXT: [[TMP47:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8 // CHECK8-NEXT: [[TMP48:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_27]], align 4 // CHECK8-NEXT: [[TMP49:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4 -// CHECK8-NEXT: [[SUB88:%.*]] = sub i32 [[TMP48]], [[TMP49]] -// CHECK8-NEXT: [[SUB89:%.*]] = sub i32 [[SUB88]], 1 -// CHECK8-NEXT: [[ADD90:%.*]] = add i32 [[SUB89]], 1 -// CHECK8-NEXT: [[DIV91:%.*]] = udiv i32 [[ADD90]], 1 -// CHECK8-NEXT: [[MUL92:%.*]] = mul i32 1, [[DIV91]] -// CHECK8-NEXT: [[CONV93:%.*]] = zext i32 [[MUL92]] to i64 -// CHECK8-NEXT: [[DIV94:%.*]] = sdiv i64 [[TMP47]], [[CONV93]] -// CHECK8-NEXT: [[MUL95:%.*]] = mul nsw i64 [[DIV94]], 1 -// CHECK8-NEXT: [[ADD96:%.*]] = add nsw i64 0, [[MUL95]] -// CHECK8-NEXT: [[CONV97:%.*]] = trunc i64 [[ADD96]] to i32 -// CHECK8-NEXT: store i32 [[CONV97]], ptr [[I46]], align 4 +// CHECK8-NEXT: [[SUB87:%.*]] = sub i32 [[TMP48]], [[TMP49]] +// CHECK8-NEXT: [[SUB88:%.*]] = sub i32 [[SUB87]], 1 +// CHECK8-NEXT: [[ADD89:%.*]] = add i32 [[SUB88]], 1 +// CHECK8-NEXT: [[DIV90:%.*]] = udiv i32 [[ADD89]], 1 +// CHECK8-NEXT: [[MUL91:%.*]] = mul i32 1, [[DIV90]] +// CHECK8-NEXT: [[CONV92:%.*]] = zext i32 [[MUL91]] to i64 +// CHECK8-NEXT: [[DIV93:%.*]] = sdiv i64 [[TMP47]], [[CONV92]] +// CHECK8-NEXT: [[MUL94:%.*]] = mul nsw i64 [[DIV93]], 1 +// CHECK8-NEXT: [[ADD95:%.*]] = add nsw i64 0, [[MUL94]] +// CHECK8-NEXT: [[CONV96:%.*]] = trunc i64 [[ADD95]] to i32 +// CHECK8-NEXT: store i32 [[CONV96]], ptr [[I46]], align 4 // CHECK8-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4 -// CHECK8-NEXT: [[CONV98:%.*]] = sext i32 [[TMP50]] to i64 +// CHECK8-NEXT: [[CONV97:%.*]] = sext i32 [[TMP50]] to i64 // CHECK8-NEXT: [[TMP51:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8 // CHECK8-NEXT: [[TMP52:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8 // CHECK8-NEXT: [[TMP53:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_27]], align 4 // CHECK8-NEXT: [[TMP54:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4 -// CHECK8-NEXT: [[SUB99:%.*]] = sub i32 [[TMP53]], [[TMP54]] -// CHECK8-NEXT: [[SUB100:%.*]] = sub i32 [[SUB99]], 1 -// CHECK8-NEXT: [[ADD101:%.*]] = add i32 [[SUB100]], 1 -// CHECK8-NEXT: [[DIV102:%.*]] = udiv i32 [[ADD101]], 1 -// CHECK8-NEXT: [[MUL103:%.*]] = mul i32 1, [[DIV102]] -// CHECK8-NEXT: [[CONV104:%.*]] = zext i32 [[MUL103]] to i64 -// CHECK8-NEXT: [[DIV105:%.*]] = sdiv i64 [[TMP52]], [[CONV104]] +// CHECK8-NEXT: [[SUB98:%.*]] = sub i32 [[TMP53]], [[TMP54]] +// CHECK8-NEXT: [[SUB99:%.*]] = sub i32 [[SUB98]], 1 +// CHECK8-NEXT: [[ADD100:%.*]] = add i32 [[SUB99]], 1 +// CHECK8-NEXT: [[DIV101:%.*]] = udiv i32 [[ADD100]], 1 +// CHECK8-NEXT: [[MUL102:%.*]] = mul i32 1, [[DIV101]] +// CHECK8-NEXT: [[CONV103:%.*]] = zext i32 [[MUL102]] to i64 +// CHECK8-NEXT: [[DIV104:%.*]] = sdiv i64 [[TMP52]], [[CONV103]] // CHECK8-NEXT: [[TMP55:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_27]], align 4 // CHECK8-NEXT: [[TMP56:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4 -// CHECK8-NEXT: [[SUB106:%.*]] = sub i32 [[TMP55]], [[TMP56]] -// CHECK8-NEXT: [[SUB107:%.*]] = sub i32 [[SUB106]], 1 -// CHECK8-NEXT: [[ADD108:%.*]] = add i32 [[SUB107]], 1 -// CHECK8-NEXT: [[DIV109:%.*]] = udiv i32 [[ADD108]], 1 -// CHECK8-NEXT: [[MUL110:%.*]] = mul i32 1, [[DIV109]] -// CHECK8-NEXT: [[CONV111:%.*]] = zext i32 [[MUL110]] to i64 -// CHECK8-NEXT: [[MUL112:%.*]] = mul nsw i64 [[DIV105]], [[CONV111]] -// CHECK8-NEXT: [[SUB113:%.*]] = sub nsw i64 [[TMP51]], [[MUL112]] -// CHECK8-NEXT: [[MUL114:%.*]] = mul nsw i64 [[SUB113]], 1 -// CHECK8-NEXT: [[ADD115:%.*]] = add nsw i64 [[CONV98]], [[MUL114]] -// CHECK8-NEXT: [[CONV116:%.*]] = trunc i64 [[ADD115]] to i32 -// CHECK8-NEXT: store i32 [[CONV116]], ptr [[J47]], align 4 -// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE117:%.*]] -// CHECK8: omp.body.continue117: -// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC118:%.*]] -// CHECK8: omp.inner.for.inc118: +// CHECK8-NEXT: [[SUB105:%.*]] = sub i32 [[TMP55]], [[TMP56]] +// CHECK8-NEXT: [[SUB106:%.*]] = sub i32 [[SUB105]], 1 +// CHECK8-NEXT: [[ADD107:%.*]] = add i32 [[SUB106]], 1 +// CHECK8-NEXT: [[DIV108:%.*]] = udiv i32 [[ADD107]], 1 +// CHECK8-NEXT: [[MUL109:%.*]] = mul i32 1, [[DIV108]] +// CHECK8-NEXT: [[CONV110:%.*]] = zext i32 [[MUL109]] to i64 +// CHECK8-NEXT: [[MUL111:%.*]] = mul nsw i64 [[DIV104]], [[CONV110]] +// CHECK8-NEXT: [[SUB112:%.*]] = sub nsw i64 [[TMP51]], [[MUL111]] +// CHECK8-NEXT: [[MUL113:%.*]] = mul nsw i64 [[SUB112]], 1 +// CHECK8-NEXT: [[ADD114:%.*]] = add nsw i64 [[CONV97]], [[MUL113]] +// CHECK8-NEXT: [[CONV115:%.*]] = trunc i64 [[ADD114]] to i32 +// CHECK8-NEXT: store i32 [[CONV115]], ptr [[J47]], align 4 +// CHECK8-NEXT: br label [[OMP_BODY_CONTINUE116:%.*]] +// CHECK8: omp.body.continue116: +// CHECK8-NEXT: br label [[OMP_INNER_FOR_INC117:%.*]] +// CHECK8: omp.inner.for.inc117: // CHECK8-NEXT: [[TMP57:%.*]] = load i64, ptr [[DOTOMP_IV45]], align 8 -// CHECK8-NEXT: [[ADD119:%.*]] = add nsw i64 [[TMP57]], 1 -// CHECK8-NEXT: store i64 [[ADD119]], ptr [[DOTOMP_IV45]], align 8 -// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND85]], !llvm.loop [[LOOP12:![0-9]+]] -// CHECK8: omp.inner.for.end120: +// CHECK8-NEXT: [[ADD118:%.*]] = add nsw i64 [[TMP57]], 1 +// CHECK8-NEXT: store i64 [[ADD118]], ptr [[DOTOMP_IV45]], align 8 +// CHECK8-NEXT: br label [[OMP_INNER_FOR_COND84]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK8: omp.inner.for.end119: // CHECK8-NEXT: br label [[OMP_IF_END]] // CHECK8: omp_if.end: // CHECK8-NEXT: [[TMP58:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_25]], align 4 -// CHECK8-NEXT: [[SUB121:%.*]] = sub nsw i32 [[TMP58]], 0 -// CHECK8-NEXT: [[DIV122:%.*]] = sdiv i32 [[SUB121]], 1 -// CHECK8-NEXT: [[MUL123:%.*]] = mul nsw i32 [[DIV122]], 1 -// CHECK8-NEXT: [[ADD124:%.*]] = add nsw i32 0, [[MUL123]] -// CHECK8-NEXT: store i32 [[ADD124]], ptr [[I20]], align 4 +// CHECK8-NEXT: [[SUB120:%.*]] = sub nsw i32 [[TMP58]], 0 +// CHECK8-NEXT: [[DIV121:%.*]] = sdiv i32 [[SUB120]], 1 +// CHECK8-NEXT: [[MUL122:%.*]] = mul nsw i32 [[DIV121]], 1 +// CHECK8-NEXT: [[ADD123:%.*]] = add nsw i32 0, [[MUL122]] +// CHECK8-NEXT: store i32 [[ADD123]], ptr [[I20]], align 4 // CHECK8-NEXT: [[TMP59:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4 // CHECK8-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_27]], align 4 // CHECK8-NEXT: [[TMP61:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_26]], align 4 -// CHECK8-NEXT: [[SUB125:%.*]] = sub i32 [[TMP60]], [[TMP61]] -// CHECK8-NEXT: [[SUB126:%.*]] = sub i32 [[SUB125]], 1 -// CHECK8-NEXT: [[ADD127:%.*]] = add i32 [[SUB126]], 1 -// CHECK8-NEXT: [[DIV128:%.*]] = udiv i32 [[ADD127]], 1 -// CHECK8-NEXT: [[MUL129:%.*]] = mul i32 [[DIV128]], 1 -// CHECK8-NEXT: [[ADD130:%.*]] = add i32 [[TMP59]], [[MUL129]] -// CHECK8-NEXT: store i32 [[ADD130]], ptr [[J47]], align 4 +// CHECK8-NEXT: [[SUB124:%.*]] = sub i32 [[TMP60]], [[TMP61]] +// CHECK8-NEXT: [[SUB125:%.*]] = sub i32 [[SUB124]], 1 +// CHECK8-NEXT: [[ADD126:%.*]] = add i32 [[SUB125]], 1 +// CHECK8-NEXT: [[DIV127:%.*]] = udiv i32 [[ADD126]], 1 +// CHECK8-NEXT: [[MUL128:%.*]] = mul i32 [[DIV127]], 1 +// CHECK8-NEXT: [[ADD129:%.*]] = add i32 [[TMP59]], [[MUL128]] +// CHECK8-NEXT: store i32 [[ADD129]], ptr [[J47]], align 4 // CHECK8-NEXT: br label [[SIMD_IF_END]] // CHECK8: simd.if.end: // CHECK8-NEXT: [[TMP62:%.*]] = load i32, ptr [[RETVAL]], align 4 @@ -3962,8 +3962,8 @@ struct S { // CHECK8-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK8-NEXT: [[TMP0:%.*]] = load i32, ptr [[C_ADDR]], align 4 // CHECK8-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 -// CHECK8-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK8-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK8-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK8-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK8-NEXT: store ptr [[TMP]], ptr [[_TMP2]], align 8 // CHECK8-NEXT: [[TMP1:%.*]] = load i32, ptr [[C_ADDR]], align 4 // CHECK8-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_3]], align 4 diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp index 9be3ca8fd7587c..a9f28981d63936 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp @@ -472,8 +472,8 @@ int main() { // CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 -// CHECK1-NEXT: [[_TMP5:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK1-NEXT: [[_TMP4:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -511,16 +511,16 @@ int main() { // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83() #[[ATTR2]] // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr @Arg, align 4 // CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP15]], 0 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP16:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP16]] to i1 -// CHECK1-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP16]] to i1 +// CHECK1-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP18]] to i1 -// CHECK1-NEXT: br i1 [[TOBOOL3]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK1-NEXT: [[LOADEDV2:%.*]] = trunc i8 [[TMP18]] to i1 +// CHECK1-NEXT: br i1 [[LOADEDV2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK1: omp_if.then: // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: store i64 [[TMP17]], ptr [[TMP19]], align 8 @@ -531,42 +531,42 @@ int main() { // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK1-NEXT: [[TMP24:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL4:%.*]] = trunc i8 [[TMP24]] to i1 -// CHECK1-NEXT: [[TMP25:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1 +// CHECK1-NEXT: [[LOADEDV3:%.*]] = trunc i8 [[TMP24]] to i1 +// CHECK1-NEXT: [[TMP25:%.*]] = select i1 [[LOADEDV3]], i32 0, i32 1 // CHECK1-NEXT: [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP25]], 0 -// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP27]], align 4 -// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 // CHECK1-NEXT: store i32 1, ptr [[TMP28]], align 4 -// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 // CHECK1-NEXT: store ptr [[TMP22]], ptr [[TMP29]], align 8 -// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 // CHECK1-NEXT: store ptr [[TMP23]], ptr [[TMP30]], align 8 -// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4 +// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 // CHECK1-NEXT: store ptr @.offload_sizes, ptr [[TMP31]], align 8 -// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 // CHECK1-NEXT: store ptr @.offload_maptypes, ptr [[TMP32]], align 8 -// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6 +// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 // CHECK1-NEXT: store ptr null, ptr [[TMP33]], align 8 -// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7 +// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 // CHECK1-NEXT: store ptr null, ptr [[TMP34]], align 8 -// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8 +// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 // CHECK1-NEXT: store i64 100, ptr [[TMP35]], align 8 -// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9 +// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 // CHECK1-NEXT: store i64 0, ptr [[TMP36]], align 8 -// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10 +// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 // CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP37]], align 4 -// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11 +// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 // CHECK1-NEXT: store [3 x i32] [[TMP26]], ptr [[TMP38]], align 4 -// CHECK1-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12 +// CHECK1-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 // CHECK1-NEXT: store i32 0, ptr [[TMP39]], align 4 -// CHECK1-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP25]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90.region_id, ptr [[KERNEL_ARGS6]]) +// CHECK1-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP25]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90.region_id, ptr [[KERNEL_ARGS5]]) // CHECK1-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0 -// CHECK1-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]] -// CHECK1: omp_offload.failed7: +// CHECK1-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CHECK1: omp_offload.failed6: // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90(i64 [[TMP17]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT8]] -// CHECK1: omp_offload.cont8: +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CHECK1: omp_offload.cont7: // CHECK1-NEXT: br label [[OMP_IF_END:%.*]] // CHECK1: omp_if.else: // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90(i64 [[TMP17]]) #[[ATTR2]] @@ -865,9 +865,9 @@ int main() { // CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 // CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90.omp_outlined, i64 [[TMP1]]) // CHECK1-NEXT: ret void @@ -922,8 +922,8 @@ int main() { // CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP11]] to i1 -// CHECK1-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP11]] to i1 +// CHECK1-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK1: omp_if.then: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]]) // CHECK1-NEXT: br label [[OMP_IF_END:%.*]] @@ -1031,8 +1031,8 @@ int main() { // CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 -// CHECK1-NEXT: [[_TMP4:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK1-NEXT: [[_TMP3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[KERNEL_ARGS4:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK1-NEXT: store i32 [[ARG]], ptr [[ARG_ADDR]], align 4 // CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -1070,12 +1070,12 @@ int main() { // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l64() #[[ATTR2]] // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP15]], 0 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP16:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP16]] to i1 -// CHECK1-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP16]] to i1 +// CHECK1-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: store i64 [[TMP17]], ptr [[TMP18]], align 8 @@ -1086,42 +1086,42 @@ int main() { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK1-NEXT: [[TMP23:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP23]] to i1 -// CHECK1-NEXT: [[TMP24:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1 +// CHECK1-NEXT: [[LOADEDV2:%.*]] = trunc i8 [[TMP23]] to i1 +// CHECK1-NEXT: [[TMP24:%.*]] = select i1 [[LOADEDV2]], i32 0, i32 1 // CHECK1-NEXT: [[TMP25:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0 -// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP26]], align 4 -// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1 // CHECK1-NEXT: store i32 1, ptr [[TMP27]], align 4 -// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[TMP28]], align 8 -// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 +// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 3 // CHECK1-NEXT: store ptr [[TMP22]], ptr [[TMP29]], align 8 -// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 4 // CHECK1-NEXT: store ptr @.offload_sizes.1, ptr [[TMP30]], align 8 -// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 +// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 5 // CHECK1-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP31]], align 8 -// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 6 // CHECK1-NEXT: store ptr null, ptr [[TMP32]], align 8 -// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 +// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 7 // CHECK1-NEXT: store ptr null, ptr [[TMP33]], align 8 -// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 +// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 8 // CHECK1-NEXT: store i64 100, ptr [[TMP34]], align 8 -// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 +// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 9 // CHECK1-NEXT: store i64 0, ptr [[TMP35]], align 8 -// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 +// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 10 // CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP36]], align 4 -// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 +// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 11 // CHECK1-NEXT: store [3 x i32] [[TMP25]], ptr [[TMP37]], align 4 -// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 +// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 12 // CHECK1-NEXT: store i32 0, ptr [[TMP38]], align 4 -// CHECK1-NEXT: [[TMP39:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP24]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l68.region_id, ptr [[KERNEL_ARGS5]]) +// CHECK1-NEXT: [[TMP39:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP24]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l68.region_id, ptr [[KERNEL_ARGS4]]) // CHECK1-NEXT: [[TMP40:%.*]] = icmp ne i32 [[TMP39]], 0 -// CHECK1-NEXT: br i1 [[TMP40]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] -// CHECK1: omp_offload.failed6: +// CHECK1-NEXT: br i1 [[TMP40]], label [[OMP_OFFLOAD_FAILED5:%.*]], label [[OMP_OFFLOAD_CONT6:%.*]] +// CHECK1: omp_offload.failed5: // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l68(i64 [[TMP17]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT7]] -// CHECK1: omp_offload.cont7: +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT6]] +// CHECK1: omp_offload.cont6: // CHECK1-NEXT: ret i32 0 // // @@ -1413,9 +1413,9 @@ int main() { // CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 // CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l68.omp_outlined, i64 [[TMP1]]) // CHECK1-NEXT: ret void @@ -1470,8 +1470,8 @@ int main() { // CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP11]] to i1 -// CHECK1-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP11]] to i1 +// CHECK1-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK1: omp_if.then: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l68.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]]) // CHECK1-NEXT: br label [[OMP_IF_END:%.*]] diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp index 71f4506fb6348e..48764b27ed6fa8 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp @@ -532,8 +532,8 @@ int main() { // CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 -// CHECK1-NEXT: [[_TMP5:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK1-NEXT: [[_TMP4:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -571,16 +571,16 @@ int main() { // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l85() #[[ATTR2]] // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr @Arg, align 4 // CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP15]], 0 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP16:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP16]] to i1 -// CHECK1-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP16]] to i1 +// CHECK1-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP18]] to i1 -// CHECK1-NEXT: br i1 [[TOBOOL3]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK1-NEXT: [[LOADEDV2:%.*]] = trunc i8 [[TMP18]] to i1 +// CHECK1-NEXT: br i1 [[LOADEDV2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK1: omp_if.then: // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: store i64 [[TMP17]], ptr [[TMP19]], align 8 @@ -591,42 +591,42 @@ int main() { // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK1-NEXT: [[TMP24:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL4:%.*]] = trunc i8 [[TMP24]] to i1 -// CHECK1-NEXT: [[TMP25:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1 +// CHECK1-NEXT: [[LOADEDV3:%.*]] = trunc i8 [[TMP24]] to i1 +// CHECK1-NEXT: [[TMP25:%.*]] = select i1 [[LOADEDV3]], i32 0, i32 1 // CHECK1-NEXT: [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP25]], 0 -// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP27]], align 4 -// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 // CHECK1-NEXT: store i32 1, ptr [[TMP28]], align 4 -// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 // CHECK1-NEXT: store ptr [[TMP22]], ptr [[TMP29]], align 8 -// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 // CHECK1-NEXT: store ptr [[TMP23]], ptr [[TMP30]], align 8 -// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4 +// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 // CHECK1-NEXT: store ptr @.offload_sizes.1, ptr [[TMP31]], align 8 -// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 // CHECK1-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP32]], align 8 -// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6 +// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 // CHECK1-NEXT: store ptr null, ptr [[TMP33]], align 8 -// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7 +// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 // CHECK1-NEXT: store ptr null, ptr [[TMP34]], align 8 -// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8 +// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 // CHECK1-NEXT: store i64 100, ptr [[TMP35]], align 8 -// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9 +// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 // CHECK1-NEXT: store i64 0, ptr [[TMP36]], align 8 -// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10 +// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 // CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP37]], align 4 -// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11 +// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 // CHECK1-NEXT: store [3 x i32] [[TMP26]], ptr [[TMP38]], align 4 -// CHECK1-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12 +// CHECK1-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 // CHECK1-NEXT: store i32 0, ptr [[TMP39]], align 4 -// CHECK1-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP25]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.region_id, ptr [[KERNEL_ARGS6]]) +// CHECK1-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP25]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.region_id, ptr [[KERNEL_ARGS5]]) // CHECK1-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0 -// CHECK1-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]] -// CHECK1: omp_offload.failed7: +// CHECK1-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CHECK1: omp_offload.failed6: // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP17]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT8]] -// CHECK1: omp_offload.cont8: +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CHECK1: omp_offload.cont7: // CHECK1-NEXT: br label [[OMP_IF_END:%.*]] // CHECK1: omp_if.else: // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP17]]) #[[ATTR2]] @@ -953,9 +953,9 @@ int main() { // CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 // CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined, i64 [[TMP1]]) // CHECK1-NEXT: ret void @@ -1010,8 +1010,8 @@ int main() { // CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP36]] // CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP36]] -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP11]] to i1 -// CHECK1-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP11]] to i1 +// CHECK1-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK1: omp_if.then: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]]), !llvm.access.group [[ACC_GRP36]] // CHECK1-NEXT: br label [[OMP_IF_END:%.*]] @@ -1133,8 +1133,8 @@ int main() { // CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 -// CHECK1-NEXT: [[_TMP4:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK1-NEXT: [[_TMP3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[KERNEL_ARGS4:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK1-NEXT: store i32 [[ARG]], ptr [[ARG_ADDR]], align 4 // CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -1172,12 +1172,12 @@ int main() { // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l66() #[[ATTR2]] // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP15]], 0 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP16:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP16]] to i1 -// CHECK1-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP16]] to i1 +// CHECK1-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: store i64 [[TMP17]], ptr [[TMP18]], align 8 @@ -1188,42 +1188,42 @@ int main() { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK1-NEXT: [[TMP23:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP23]] to i1 -// CHECK1-NEXT: [[TMP24:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1 +// CHECK1-NEXT: [[LOADEDV2:%.*]] = trunc i8 [[TMP23]] to i1 +// CHECK1-NEXT: [[TMP24:%.*]] = select i1 [[LOADEDV2]], i32 0, i32 1 // CHECK1-NEXT: [[TMP25:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0 -// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP26]], align 4 -// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1 // CHECK1-NEXT: store i32 1, ptr [[TMP27]], align 4 -// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[TMP28]], align 8 -// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 +// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 3 // CHECK1-NEXT: store ptr [[TMP22]], ptr [[TMP29]], align 8 -// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 4 // CHECK1-NEXT: store ptr @.offload_sizes.3, ptr [[TMP30]], align 8 -// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 +// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 5 // CHECK1-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP31]], align 8 -// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 6 // CHECK1-NEXT: store ptr null, ptr [[TMP32]], align 8 -// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 +// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 7 // CHECK1-NEXT: store ptr null, ptr [[TMP33]], align 8 -// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 +// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 8 // CHECK1-NEXT: store i64 100, ptr [[TMP34]], align 8 -// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 +// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 9 // CHECK1-NEXT: store i64 0, ptr [[TMP35]], align 8 -// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 +// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 10 // CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP36]], align 4 -// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 +// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 11 // CHECK1-NEXT: store [3 x i32] [[TMP25]], ptr [[TMP37]], align 4 -// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 +// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 12 // CHECK1-NEXT: store i32 0, ptr [[TMP38]], align 4 -// CHECK1-NEXT: [[TMP39:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP24]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70.region_id, ptr [[KERNEL_ARGS5]]) +// CHECK1-NEXT: [[TMP39:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP24]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70.region_id, ptr [[KERNEL_ARGS4]]) // CHECK1-NEXT: [[TMP40:%.*]] = icmp ne i32 [[TMP39]], 0 -// CHECK1-NEXT: br i1 [[TMP40]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] -// CHECK1: omp_offload.failed6: +// CHECK1-NEXT: br i1 [[TMP40]], label [[OMP_OFFLOAD_FAILED5:%.*]], label [[OMP_OFFLOAD_CONT6:%.*]] +// CHECK1: omp_offload.failed5: // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70(i64 [[TMP17]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT7]] -// CHECK1: omp_offload.cont7: +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT6]] +// CHECK1: omp_offload.cont6: // CHECK1-NEXT: ret i32 0 // // @@ -1543,9 +1543,9 @@ int main() { // CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 // CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70.omp_outlined, i64 [[TMP1]]) // CHECK1-NEXT: ret void @@ -1600,8 +1600,8 @@ int main() { // CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP54]] // CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP54]] -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP11]] to i1 -// CHECK1-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP11]] to i1 +// CHECK1-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK1: omp_if.then: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]]), !llvm.access.group [[ACC_GRP54]] // CHECK1-NEXT: br label [[OMP_IF_END:%.*]] @@ -2137,8 +2137,8 @@ int main() { // CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 -// CHECK3-NEXT: [[_TMP5:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK3-NEXT: [[_TMP4:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK3-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK3-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -2176,16 +2176,16 @@ int main() { // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l85() #[[ATTR2]] // CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr @Arg, align 4 // CHECK3-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP15]], 0 -// CHECK3-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK3-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK3-NEXT: [[TMP16:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK3-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP16]] to i1 -// CHECK3-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP16]] to i1 +// CHECK3-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK3-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK3-NEXT: [[TMP18:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK3-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP18]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL3]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK3-NEXT: [[LOADEDV2:%.*]] = trunc i8 [[TMP18]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK3: omp_if.then: // CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK3-NEXT: store i64 [[TMP17]], ptr [[TMP19]], align 8 @@ -2196,42 +2196,42 @@ int main() { // CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK3-NEXT: [[TMP24:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK3-NEXT: [[TOBOOL4:%.*]] = trunc i8 [[TMP24]] to i1 -// CHECK3-NEXT: [[TMP25:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1 +// CHECK3-NEXT: [[LOADEDV3:%.*]] = trunc i8 [[TMP24]] to i1 +// CHECK3-NEXT: [[TMP25:%.*]] = select i1 [[LOADEDV3]], i32 0, i32 1 // CHECK3-NEXT: [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP25]], 0 -// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 // CHECK3-NEXT: store i32 3, ptr [[TMP27]], align 4 -// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 // CHECK3-NEXT: store i32 1, ptr [[TMP28]], align 4 -// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 // CHECK3-NEXT: store ptr [[TMP22]], ptr [[TMP29]], align 8 -// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 // CHECK3-NEXT: store ptr [[TMP23]], ptr [[TMP30]], align 8 -// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4 +// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 // CHECK3-NEXT: store ptr @.offload_sizes.2, ptr [[TMP31]], align 8 -// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5 +// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 // CHECK3-NEXT: store ptr @.offload_maptypes.3, ptr [[TMP32]], align 8 -// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6 +// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 // CHECK3-NEXT: store ptr null, ptr [[TMP33]], align 8 -// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7 +// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 // CHECK3-NEXT: store ptr null, ptr [[TMP34]], align 8 -// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8 +// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 // CHECK3-NEXT: store i64 100, ptr [[TMP35]], align 8 -// CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9 +// CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 // CHECK3-NEXT: store i64 0, ptr [[TMP36]], align 8 -// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10 +// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 // CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP37]], align 4 -// CHECK3-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11 +// CHECK3-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 // CHECK3-NEXT: store [3 x i32] [[TMP26]], ptr [[TMP38]], align 4 -// CHECK3-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12 +// CHECK3-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 // CHECK3-NEXT: store i32 0, ptr [[TMP39]], align 4 -// CHECK3-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP25]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.region_id, ptr [[KERNEL_ARGS6]]) +// CHECK3-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP25]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.region_id, ptr [[KERNEL_ARGS5]]) // CHECK3-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0 -// CHECK3-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]] -// CHECK3: omp_offload.failed7: +// CHECK3-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CHECK3: omp_offload.failed6: // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP17]]) #[[ATTR2]] -// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT8]] -// CHECK3: omp_offload.cont8: +// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CHECK3: omp_offload.cont7: // CHECK3-NEXT: br label [[OMP_IF_END:%.*]] // CHECK3: omp_if.else: // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP17]]) #[[ATTR2]] @@ -2558,9 +2558,9 @@ int main() { // CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 // CHECK3-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 // CHECK3-NEXT: [[TMP0:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK3-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK3-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK3-NEXT: [[STOREDV:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK3-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined, i64 [[TMP1]]) // CHECK3-NEXT: ret void @@ -2607,8 +2607,8 @@ int main() { // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK3-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4 // CHECK3-NEXT: [[TMP5:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK3-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP5]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE5:%.*]] +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP5]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE5:%.*]] // CHECK3: omp_if.then: // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK3: omp.inner.for.cond: @@ -2622,13 +2622,13 @@ int main() { // CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP34]] // CHECK3-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 // CHECK3-NEXT: [[TMP12:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP34]] -// CHECK3-NEXT: [[TOBOOL2:%.*]] = trunc i8 [[TMP12]] to i1 -// CHECK3-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL2]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1, !llvm.access.group [[ACC_GRP34]] +// CHECK3-NEXT: [[LOADEDV2:%.*]] = trunc i8 [[TMP12]] to i1 +// CHECK3-NEXT: [[STOREDV:%.*]] = zext i1 [[LOADEDV2]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1, !llvm.access.group [[ACC_GRP34]] // CHECK3-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8, !llvm.access.group [[ACC_GRP34]] // CHECK3-NEXT: [[TMP14:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP34]] -// CHECK3-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP14]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL3]], label [[OMP_IF_THEN4:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK3-NEXT: [[LOADEDV3:%.*]] = trunc i8 [[TMP14]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV3]], label [[OMP_IF_THEN4:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK3: omp_if.then4: // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined, i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]]), !llvm.access.group [[ACC_GRP34]] // CHECK3-NEXT: br label [[OMP_IF_END:%.*]] @@ -2662,13 +2662,13 @@ int main() { // CHECK3-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK3-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64 // CHECK3-NEXT: [[TMP24:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK3-NEXT: [[TOBOOL9:%.*]] = trunc i8 [[TMP24]] to i1 -// CHECK3-NEXT: [[FROMBOOL11:%.*]] = zext i1 [[TOBOOL9]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL11]], ptr [[DOTCAPTURE_EXPR__CASTED10]], align 1 +// CHECK3-NEXT: [[LOADEDV9:%.*]] = trunc i8 [[TMP24]] to i1 +// CHECK3-NEXT: [[STOREDV11:%.*]] = zext i1 [[LOADEDV9]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV11]], ptr [[DOTCAPTURE_EXPR__CASTED10]], align 1 // CHECK3-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED10]], align 8 // CHECK3-NEXT: [[TMP26:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK3-NEXT: [[TOBOOL12:%.*]] = trunc i8 [[TMP26]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL12]], label [[OMP_IF_THEN13:%.*]], label [[OMP_IF_ELSE14:%.*]] +// CHECK3-NEXT: [[LOADEDV12:%.*]] = trunc i8 [[TMP26]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV12]], label [[OMP_IF_THEN13:%.*]], label [[OMP_IF_ELSE14:%.*]] // CHECK3: omp_if.then13: // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined.1, i64 [[TMP21]], i64 [[TMP23]], i64 [[TMP25]]) // CHECK3-NEXT: br label [[OMP_IF_END16:%.*]] @@ -2734,8 +2734,8 @@ int main() { // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK3-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP2]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK3: omp_if.then: // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 @@ -2861,8 +2861,8 @@ int main() { // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK3-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP2]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK3: omp_if.then: // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 @@ -2968,8 +2968,8 @@ int main() { // CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 -// CHECK3-NEXT: [[_TMP4:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK3-NEXT: [[_TMP3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[KERNEL_ARGS4:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK3-NEXT: store i32 [[ARG]], ptr [[ARG_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK3-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -3007,12 +3007,12 @@ int main() { // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l66() #[[ATTR2]] // CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK3-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP15]], 0 -// CHECK3-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK3-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK3-NEXT: [[TMP16:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK3-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP16]] to i1 -// CHECK3-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP16]] to i1 +// CHECK3-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK3-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK3-NEXT: store i64 [[TMP17]], ptr [[TMP18]], align 8 @@ -3023,42 +3023,42 @@ int main() { // CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK3-NEXT: [[TMP23:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK3-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP23]] to i1 -// CHECK3-NEXT: [[TMP24:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1 +// CHECK3-NEXT: [[LOADEDV2:%.*]] = trunc i8 [[TMP23]] to i1 +// CHECK3-NEXT: [[TMP24:%.*]] = select i1 [[LOADEDV2]], i32 0, i32 1 // CHECK3-NEXT: [[TMP25:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0 -// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0 // CHECK3-NEXT: store i32 3, ptr [[TMP26]], align 4 -// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1 // CHECK3-NEXT: store i32 1, ptr [[TMP27]], align 4 -// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2 // CHECK3-NEXT: store ptr [[TMP21]], ptr [[TMP28]], align 8 -// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 3 // CHECK3-NEXT: store ptr [[TMP22]], ptr [[TMP29]], align 8 -// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 +// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 4 // CHECK3-NEXT: store ptr @.offload_sizes.4, ptr [[TMP30]], align 8 -// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 +// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 5 // CHECK3-NEXT: store ptr @.offload_maptypes.5, ptr [[TMP31]], align 8 -// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 +// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 6 // CHECK3-NEXT: store ptr null, ptr [[TMP32]], align 8 -// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 +// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 7 // CHECK3-NEXT: store ptr null, ptr [[TMP33]], align 8 -// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 +// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 8 // CHECK3-NEXT: store i64 100, ptr [[TMP34]], align 8 -// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 +// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 9 // CHECK3-NEXT: store i64 0, ptr [[TMP35]], align 8 -// CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 +// CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 10 // CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP36]], align 4 -// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 +// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 11 // CHECK3-NEXT: store [3 x i32] [[TMP25]], ptr [[TMP37]], align 4 -// CHECK3-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 +// CHECK3-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 12 // CHECK3-NEXT: store i32 0, ptr [[TMP38]], align 4 -// CHECK3-NEXT: [[TMP39:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP24]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70.region_id, ptr [[KERNEL_ARGS5]]) +// CHECK3-NEXT: [[TMP39:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP24]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70.region_id, ptr [[KERNEL_ARGS4]]) // CHECK3-NEXT: [[TMP40:%.*]] = icmp ne i32 [[TMP39]], 0 -// CHECK3-NEXT: br i1 [[TMP40]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] -// CHECK3: omp_offload.failed6: +// CHECK3-NEXT: br i1 [[TMP40]], label [[OMP_OFFLOAD_FAILED5:%.*]], label [[OMP_OFFLOAD_CONT6:%.*]] +// CHECK3: omp_offload.failed5: // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70(i64 [[TMP17]]) #[[ATTR2]] -// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT7]] -// CHECK3: omp_offload.cont7: +// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT6]] +// CHECK3: omp_offload.cont6: // CHECK3-NEXT: ret i32 0 // // @@ -3378,9 +3378,9 @@ int main() { // CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 // CHECK3-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 // CHECK3-NEXT: [[TMP0:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK3-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK3-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK3-NEXT: [[STOREDV:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK3-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70.omp_outlined, i64 [[TMP1]]) // CHECK3-NEXT: ret void @@ -3435,8 +3435,8 @@ int main() { // CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP54]] // CHECK3-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 // CHECK3-NEXT: [[TMP11:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP54]] -// CHECK3-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP11]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP11]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK3: omp_if.then: // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]]), !llvm.access.group [[ACC_GRP54]] // CHECK3-NEXT: br label [[OMP_IF_END:%.*]] @@ -3690,8 +3690,8 @@ int main() { // CHECK5-NEXT: store i32 100, ptr [[I6]], align 4 // CHECK5-NEXT: [[TMP10:%.*]] = load i32, ptr @Arg, align 4 // CHECK5-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP10]], 0 -// CHECK5-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK5-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK5-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK5-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK5-NEXT: store i32 0, ptr [[DOTOMP_LB17]], align 4 // CHECK5-NEXT: store i32 99, ptr [[DOTOMP_UB18]], align 4 // CHECK5-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB17]], align 4 @@ -3798,8 +3798,8 @@ int main() { // CHECK5-NEXT: store i32 100, ptr [[I6]], align 4 // CHECK5-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK5-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP10]], 0 -// CHECK5-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK5-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK5-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK5-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK5-NEXT: store i32 0, ptr [[DOTOMP_LB17]], align 4 // CHECK5-NEXT: store i32 99, ptr [[DOTOMP_UB18]], align 4 // CHECK5-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB17]], align 4 @@ -3972,60 +3972,60 @@ int main() { // CHECK7-NEXT: store i32 100, ptr [[I6]], align 4 // CHECK7-NEXT: [[TMP10:%.*]] = load i32, ptr @Arg, align 4 // CHECK7-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP10]], 0 -// CHECK7-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK7-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK7-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK7-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK7-NEXT: store i32 0, ptr [[DOTOMP_LB17]], align 4 // CHECK7-NEXT: store i32 99, ptr [[DOTOMP_UB18]], align 4 // CHECK7-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB17]], align 4 // CHECK7-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV19]], align 4 // CHECK7-NEXT: [[TMP12:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK7-NEXT: [[TOBOOL21:%.*]] = trunc i8 [[TMP12]] to i1 -// CHECK7-NEXT: br i1 [[TOBOOL21]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK7-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP12]] to i1 +// CHECK7-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK7: omp_if.then: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND22:%.*]] -// CHECK7: omp.inner.for.cond22: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND21:%.*]] +// CHECK7: omp.inner.for.cond21: // CHECK7-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP15:![0-9]+]] // CHECK7-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB18]], align 4, !llvm.access.group [[ACC_GRP15]] -// CHECK7-NEXT: [[CMP23:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] -// CHECK7-NEXT: br i1 [[CMP23]], label [[OMP_INNER_FOR_BODY24:%.*]], label [[OMP_INNER_FOR_END30:%.*]] -// CHECK7: omp.inner.for.body24: +// CHECK7-NEXT: [[CMP22:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] +// CHECK7-NEXT: br i1 [[CMP22]], label [[OMP_INNER_FOR_BODY23:%.*]], label [[OMP_INNER_FOR_END29:%.*]] +// CHECK7: omp.inner.for.body23: // CHECK7-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP15]] -// CHECK7-NEXT: [[MUL25:%.*]] = mul nsw i32 [[TMP15]], 1 -// CHECK7-NEXT: [[ADD26:%.*]] = add nsw i32 0, [[MUL25]] -// CHECK7-NEXT: store i32 [[ADD26]], ptr [[I20]], align 4, !llvm.access.group [[ACC_GRP15]] +// CHECK7-NEXT: [[MUL24:%.*]] = mul nsw i32 [[TMP15]], 1 +// CHECK7-NEXT: [[ADD25:%.*]] = add nsw i32 0, [[MUL24]] +// CHECK7-NEXT: store i32 [[ADD25]], ptr [[I20]], align 4, !llvm.access.group [[ACC_GRP15]] // CHECK7-NEXT: call void @_Z3fn6v(), !llvm.access.group [[ACC_GRP15]] -// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE27:%.*]] -// CHECK7: omp.body.continue27: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC28:%.*]] -// CHECK7: omp.inner.for.inc28: +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE26:%.*]] +// CHECK7: omp.body.continue26: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC27:%.*]] +// CHECK7: omp.inner.for.inc27: // CHECK7-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP15]] -// CHECK7-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP16]], 1 -// CHECK7-NEXT: store i32 [[ADD29]], ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP15]] -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND22]], !llvm.loop [[LOOP16:![0-9]+]] -// CHECK7: omp.inner.for.end30: +// CHECK7-NEXT: [[ADD28:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK7-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP15]] +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND21]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK7: omp.inner.for.end29: // CHECK7-NEXT: br label [[OMP_IF_END:%.*]] // CHECK7: omp_if.else: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND31:%.*]] -// CHECK7: omp.inner.for.cond31: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND30:%.*]] +// CHECK7: omp.inner.for.cond30: // CHECK7-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4 // CHECK7-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_UB18]], align 4 -// CHECK7-NEXT: [[CMP32:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]] -// CHECK7-NEXT: br i1 [[CMP32]], label [[OMP_INNER_FOR_BODY33:%.*]], label [[OMP_INNER_FOR_END39:%.*]] -// CHECK7: omp.inner.for.body33: +// CHECK7-NEXT: [[CMP31:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]] +// CHECK7-NEXT: br i1 [[CMP31]], label [[OMP_INNER_FOR_BODY32:%.*]], label [[OMP_INNER_FOR_END38:%.*]] +// CHECK7: omp.inner.for.body32: // CHECK7-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4 -// CHECK7-NEXT: [[MUL34:%.*]] = mul nsw i32 [[TMP19]], 1 -// CHECK7-NEXT: [[ADD35:%.*]] = add nsw i32 0, [[MUL34]] -// CHECK7-NEXT: store i32 [[ADD35]], ptr [[I20]], align 4 +// CHECK7-NEXT: [[MUL33:%.*]] = mul nsw i32 [[TMP19]], 1 +// CHECK7-NEXT: [[ADD34:%.*]] = add nsw i32 0, [[MUL33]] +// CHECK7-NEXT: store i32 [[ADD34]], ptr [[I20]], align 4 // CHECK7-NEXT: call void @_Z3fn6v() -// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE36:%.*]] -// CHECK7: omp.body.continue36: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC37:%.*]] -// CHECK7: omp.inner.for.inc37: +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE35:%.*]] +// CHECK7: omp.body.continue35: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC36:%.*]] +// CHECK7: omp.inner.for.inc36: // CHECK7-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4 -// CHECK7-NEXT: [[ADD38:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK7-NEXT: store i32 [[ADD38]], ptr [[DOTOMP_IV19]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND31]], !llvm.loop [[LOOP18:![0-9]+]] -// CHECK7: omp.inner.for.end39: +// CHECK7-NEXT: [[ADD37:%.*]] = add nsw i32 [[TMP20]], 1 +// CHECK7-NEXT: store i32 [[ADD37]], ptr [[DOTOMP_IV19]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK7: omp.inner.for.end38: // CHECK7-NEXT: br label [[OMP_IF_END]] // CHECK7: omp_if.end: // CHECK7-NEXT: store i32 100, ptr [[I20]], align 4 @@ -4109,8 +4109,8 @@ int main() { // CHECK7-NEXT: store i32 100, ptr [[I6]], align 4 // CHECK7-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK7-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP10]], 0 -// CHECK7-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK7-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK7-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK7-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK7-NEXT: store i32 0, ptr [[DOTOMP_LB17]], align 4 // CHECK7-NEXT: store i32 99, ptr [[DOTOMP_UB18]], align 4 // CHECK7-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB17]], align 4 @@ -4565,8 +4565,8 @@ int main() { // CHECK9-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK9-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK9-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 -// CHECK9-NEXT: [[_TMP5:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK9-NEXT: [[_TMP4:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK9-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK9-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK9-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -4604,16 +4604,16 @@ int main() { // CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l85() #[[ATTR2]] // CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr @Arg, align 4 // CHECK9-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP15]], 0 -// CHECK9-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK9-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK9-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK9-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK9-NEXT: [[TMP16:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK9-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP16]] to i1 -// CHECK9-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK9-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK9-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP16]] to i1 +// CHECK9-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK9-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK9-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK9-NEXT: [[TMP18:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK9-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP18]] to i1 -// CHECK9-NEXT: br i1 [[TOBOOL3]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK9-NEXT: [[LOADEDV2:%.*]] = trunc i8 [[TMP18]] to i1 +// CHECK9-NEXT: br i1 [[LOADEDV2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK9: omp_if.then: // CHECK9-NEXT: [[TMP19:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK9-NEXT: store i64 [[TMP17]], ptr [[TMP19]], align 8 @@ -4624,42 +4624,42 @@ int main() { // CHECK9-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK9-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK9-NEXT: [[TMP24:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK9-NEXT: [[TOBOOL4:%.*]] = trunc i8 [[TMP24]] to i1 -// CHECK9-NEXT: [[TMP25:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1 +// CHECK9-NEXT: [[LOADEDV3:%.*]] = trunc i8 [[TMP24]] to i1 +// CHECK9-NEXT: [[TMP25:%.*]] = select i1 [[LOADEDV3]], i32 0, i32 1 // CHECK9-NEXT: [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP25]], 0 -// CHECK9-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 // CHECK9-NEXT: store i32 3, ptr [[TMP27]], align 4 -// CHECK9-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1 +// CHECK9-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 // CHECK9-NEXT: store i32 1, ptr [[TMP28]], align 4 -// CHECK9-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2 +// CHECK9-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 // CHECK9-NEXT: store ptr [[TMP22]], ptr [[TMP29]], align 8 -// CHECK9-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3 +// CHECK9-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 // CHECK9-NEXT: store ptr [[TMP23]], ptr [[TMP30]], align 8 -// CHECK9-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4 +// CHECK9-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 // CHECK9-NEXT: store ptr @.offload_sizes.1, ptr [[TMP31]], align 8 -// CHECK9-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5 +// CHECK9-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 // CHECK9-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP32]], align 8 -// CHECK9-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6 +// CHECK9-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 // CHECK9-NEXT: store ptr null, ptr [[TMP33]], align 8 -// CHECK9-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7 +// CHECK9-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 // CHECK9-NEXT: store ptr null, ptr [[TMP34]], align 8 -// CHECK9-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8 +// CHECK9-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 // CHECK9-NEXT: store i64 100, ptr [[TMP35]], align 8 -// CHECK9-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9 +// CHECK9-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 // CHECK9-NEXT: store i64 0, ptr [[TMP36]], align 8 -// CHECK9-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10 +// CHECK9-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 // CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP37]], align 4 -// CHECK9-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11 +// CHECK9-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 // CHECK9-NEXT: store [3 x i32] [[TMP26]], ptr [[TMP38]], align 4 -// CHECK9-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12 +// CHECK9-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 // CHECK9-NEXT: store i32 0, ptr [[TMP39]], align 4 -// CHECK9-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP25]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.region_id, ptr [[KERNEL_ARGS6]]) +// CHECK9-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP25]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.region_id, ptr [[KERNEL_ARGS5]]) // CHECK9-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0 -// CHECK9-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]] -// CHECK9: omp_offload.failed7: +// CHECK9-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CHECK9: omp_offload.failed6: // CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP17]]) #[[ATTR2]] -// CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT8]] -// CHECK9: omp_offload.cont8: +// CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CHECK9: omp_offload.cont7: // CHECK9-NEXT: br label [[OMP_IF_END:%.*]] // CHECK9: omp_if.else: // CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP17]]) #[[ATTR2]] @@ -4986,9 +4986,9 @@ int main() { // CHECK9-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 // CHECK9-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 // CHECK9-NEXT: [[TMP0:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK9-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK9-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK9-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK9-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK9-NEXT: [[STOREDV:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK9-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined, i64 [[TMP1]]) // CHECK9-NEXT: ret void @@ -5043,8 +5043,8 @@ int main() { // CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP36]] // CHECK9-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 // CHECK9-NEXT: [[TMP11:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP36]] -// CHECK9-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP11]] to i1 -// CHECK9-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK9-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP11]] to i1 +// CHECK9-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK9: omp_if.then: // CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]]), !llvm.access.group [[ACC_GRP36]] // CHECK9-NEXT: br label [[OMP_IF_END:%.*]] @@ -5166,8 +5166,8 @@ int main() { // CHECK9-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK9-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK9-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 -// CHECK9-NEXT: [[_TMP4:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK9-NEXT: [[_TMP3:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[KERNEL_ARGS4:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK9-NEXT: store i32 [[ARG]], ptr [[ARG_ADDR]], align 4 // CHECK9-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK9-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -5205,12 +5205,12 @@ int main() { // CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l66() #[[ATTR2]] // CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK9-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP15]], 0 -// CHECK9-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK9-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK9-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK9-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK9-NEXT: [[TMP16:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK9-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP16]] to i1 -// CHECK9-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK9-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK9-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP16]] to i1 +// CHECK9-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK9-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK9-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK9-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK9-NEXT: store i64 [[TMP17]], ptr [[TMP18]], align 8 @@ -5221,42 +5221,42 @@ int main() { // CHECK9-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK9-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK9-NEXT: [[TMP23:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK9-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP23]] to i1 -// CHECK9-NEXT: [[TMP24:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1 +// CHECK9-NEXT: [[LOADEDV2:%.*]] = trunc i8 [[TMP23]] to i1 +// CHECK9-NEXT: [[TMP24:%.*]] = select i1 [[LOADEDV2]], i32 0, i32 1 // CHECK9-NEXT: [[TMP25:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0 -// CHECK9-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0 // CHECK9-NEXT: store i32 3, ptr [[TMP26]], align 4 -// CHECK9-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 +// CHECK9-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1 // CHECK9-NEXT: store i32 1, ptr [[TMP27]], align 4 -// CHECK9-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 +// CHECK9-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2 // CHECK9-NEXT: store ptr [[TMP21]], ptr [[TMP28]], align 8 -// CHECK9-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 +// CHECK9-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 3 // CHECK9-NEXT: store ptr [[TMP22]], ptr [[TMP29]], align 8 -// CHECK9-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 +// CHECK9-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 4 // CHECK9-NEXT: store ptr @.offload_sizes.3, ptr [[TMP30]], align 8 -// CHECK9-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 +// CHECK9-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 5 // CHECK9-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP31]], align 8 -// CHECK9-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 +// CHECK9-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 6 // CHECK9-NEXT: store ptr null, ptr [[TMP32]], align 8 -// CHECK9-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 +// CHECK9-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 7 // CHECK9-NEXT: store ptr null, ptr [[TMP33]], align 8 -// CHECK9-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 +// CHECK9-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 8 // CHECK9-NEXT: store i64 100, ptr [[TMP34]], align 8 -// CHECK9-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 +// CHECK9-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 9 // CHECK9-NEXT: store i64 0, ptr [[TMP35]], align 8 -// CHECK9-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 +// CHECK9-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 10 // CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP36]], align 4 -// CHECK9-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 +// CHECK9-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 11 // CHECK9-NEXT: store [3 x i32] [[TMP25]], ptr [[TMP37]], align 4 -// CHECK9-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 +// CHECK9-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 12 // CHECK9-NEXT: store i32 0, ptr [[TMP38]], align 4 -// CHECK9-NEXT: [[TMP39:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP24]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70.region_id, ptr [[KERNEL_ARGS5]]) +// CHECK9-NEXT: [[TMP39:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP24]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70.region_id, ptr [[KERNEL_ARGS4]]) // CHECK9-NEXT: [[TMP40:%.*]] = icmp ne i32 [[TMP39]], 0 -// CHECK9-NEXT: br i1 [[TMP40]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] -// CHECK9: omp_offload.failed6: +// CHECK9-NEXT: br i1 [[TMP40]], label [[OMP_OFFLOAD_FAILED5:%.*]], label [[OMP_OFFLOAD_CONT6:%.*]] +// CHECK9: omp_offload.failed5: // CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70(i64 [[TMP17]]) #[[ATTR2]] -// CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT7]] -// CHECK9: omp_offload.cont7: +// CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT6]] +// CHECK9: omp_offload.cont6: // CHECK9-NEXT: ret i32 0 // // @@ -5576,9 +5576,9 @@ int main() { // CHECK9-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 // CHECK9-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 // CHECK9-NEXT: [[TMP0:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK9-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK9-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK9-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK9-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK9-NEXT: [[STOREDV:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK9-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70.omp_outlined, i64 [[TMP1]]) // CHECK9-NEXT: ret void @@ -5633,8 +5633,8 @@ int main() { // CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP54]] // CHECK9-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 // CHECK9-NEXT: [[TMP11:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP54]] -// CHECK9-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP11]] to i1 -// CHECK9-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK9-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP11]] to i1 +// CHECK9-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK9: omp_if.then: // CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]]), !llvm.access.group [[ACC_GRP54]] // CHECK9-NEXT: br label [[OMP_IF_END:%.*]] @@ -6170,8 +6170,8 @@ int main() { // CHECK11-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK11-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK11-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 -// CHECK11-NEXT: [[_TMP5:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK11-NEXT: [[_TMP4:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK11-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK11-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK11-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -6209,16 +6209,16 @@ int main() { // CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l85() #[[ATTR2]] // CHECK11-NEXT: [[TMP15:%.*]] = load i32, ptr @Arg, align 4 // CHECK11-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP15]], 0 -// CHECK11-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK11-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK11-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK11-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK11-NEXT: [[TMP16:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK11-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP16]] to i1 -// CHECK11-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK11-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK11-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP16]] to i1 +// CHECK11-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK11-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK11-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK11-NEXT: [[TMP18:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK11-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP18]] to i1 -// CHECK11-NEXT: br i1 [[TOBOOL3]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK11-NEXT: [[LOADEDV2:%.*]] = trunc i8 [[TMP18]] to i1 +// CHECK11-NEXT: br i1 [[LOADEDV2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK11: omp_if.then: // CHECK11-NEXT: [[TMP19:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK11-NEXT: store i64 [[TMP17]], ptr [[TMP19]], align 8 @@ -6229,42 +6229,42 @@ int main() { // CHECK11-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK11-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK11-NEXT: [[TMP24:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK11-NEXT: [[TOBOOL4:%.*]] = trunc i8 [[TMP24]] to i1 -// CHECK11-NEXT: [[TMP25:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1 +// CHECK11-NEXT: [[LOADEDV3:%.*]] = trunc i8 [[TMP24]] to i1 +// CHECK11-NEXT: [[TMP25:%.*]] = select i1 [[LOADEDV3]], i32 0, i32 1 // CHECK11-NEXT: [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP25]], 0 -// CHECK11-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 // CHECK11-NEXT: store i32 3, ptr [[TMP27]], align 4 -// CHECK11-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1 +// CHECK11-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 // CHECK11-NEXT: store i32 1, ptr [[TMP28]], align 4 -// CHECK11-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2 +// CHECK11-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 // CHECK11-NEXT: store ptr [[TMP22]], ptr [[TMP29]], align 8 -// CHECK11-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3 +// CHECK11-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 // CHECK11-NEXT: store ptr [[TMP23]], ptr [[TMP30]], align 8 -// CHECK11-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4 +// CHECK11-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 // CHECK11-NEXT: store ptr @.offload_sizes.2, ptr [[TMP31]], align 8 -// CHECK11-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5 +// CHECK11-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 // CHECK11-NEXT: store ptr @.offload_maptypes.3, ptr [[TMP32]], align 8 -// CHECK11-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6 +// CHECK11-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 // CHECK11-NEXT: store ptr null, ptr [[TMP33]], align 8 -// CHECK11-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7 +// CHECK11-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 // CHECK11-NEXT: store ptr null, ptr [[TMP34]], align 8 -// CHECK11-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8 +// CHECK11-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 // CHECK11-NEXT: store i64 100, ptr [[TMP35]], align 8 -// CHECK11-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9 +// CHECK11-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 // CHECK11-NEXT: store i64 0, ptr [[TMP36]], align 8 -// CHECK11-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10 +// CHECK11-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 // CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP37]], align 4 -// CHECK11-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11 +// CHECK11-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 // CHECK11-NEXT: store [3 x i32] [[TMP26]], ptr [[TMP38]], align 4 -// CHECK11-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12 +// CHECK11-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 // CHECK11-NEXT: store i32 0, ptr [[TMP39]], align 4 -// CHECK11-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP25]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.region_id, ptr [[KERNEL_ARGS6]]) +// CHECK11-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP25]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.region_id, ptr [[KERNEL_ARGS5]]) // CHECK11-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0 -// CHECK11-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]] -// CHECK11: omp_offload.failed7: +// CHECK11-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CHECK11: omp_offload.failed6: // CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP17]]) #[[ATTR2]] -// CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT8]] -// CHECK11: omp_offload.cont8: +// CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CHECK11: omp_offload.cont7: // CHECK11-NEXT: br label [[OMP_IF_END:%.*]] // CHECK11: omp_if.else: // CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP17]]) #[[ATTR2]] @@ -6591,9 +6591,9 @@ int main() { // CHECK11-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 // CHECK11-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 // CHECK11-NEXT: [[TMP0:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK11-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK11-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK11-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK11-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK11-NEXT: [[STOREDV:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK11-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK11-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined, i64 [[TMP1]]) // CHECK11-NEXT: ret void @@ -6640,8 +6640,8 @@ int main() { // CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK11-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4 // CHECK11-NEXT: [[TMP5:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK11-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP5]] to i1 -// CHECK11-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE5:%.*]] +// CHECK11-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP5]] to i1 +// CHECK11-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE5:%.*]] // CHECK11: omp_if.then: // CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK11: omp.inner.for.cond: @@ -6655,13 +6655,13 @@ int main() { // CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP34]] // CHECK11-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 // CHECK11-NEXT: [[TMP12:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP34]] -// CHECK11-NEXT: [[TOBOOL2:%.*]] = trunc i8 [[TMP12]] to i1 -// CHECK11-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL2]] to i8 -// CHECK11-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1, !llvm.access.group [[ACC_GRP34]] +// CHECK11-NEXT: [[LOADEDV2:%.*]] = trunc i8 [[TMP12]] to i1 +// CHECK11-NEXT: [[STOREDV:%.*]] = zext i1 [[LOADEDV2]] to i8 +// CHECK11-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1, !llvm.access.group [[ACC_GRP34]] // CHECK11-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8, !llvm.access.group [[ACC_GRP34]] // CHECK11-NEXT: [[TMP14:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP34]] -// CHECK11-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP14]] to i1 -// CHECK11-NEXT: br i1 [[TOBOOL3]], label [[OMP_IF_THEN4:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK11-NEXT: [[LOADEDV3:%.*]] = trunc i8 [[TMP14]] to i1 +// CHECK11-NEXT: br i1 [[LOADEDV3]], label [[OMP_IF_THEN4:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK11: omp_if.then4: // CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined, i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]]), !llvm.access.group [[ACC_GRP34]] // CHECK11-NEXT: br label [[OMP_IF_END:%.*]] @@ -6695,13 +6695,13 @@ int main() { // CHECK11-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK11-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64 // CHECK11-NEXT: [[TMP24:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK11-NEXT: [[TOBOOL9:%.*]] = trunc i8 [[TMP24]] to i1 -// CHECK11-NEXT: [[FROMBOOL11:%.*]] = zext i1 [[TOBOOL9]] to i8 -// CHECK11-NEXT: store i8 [[FROMBOOL11]], ptr [[DOTCAPTURE_EXPR__CASTED10]], align 1 +// CHECK11-NEXT: [[LOADEDV9:%.*]] = trunc i8 [[TMP24]] to i1 +// CHECK11-NEXT: [[STOREDV11:%.*]] = zext i1 [[LOADEDV9]] to i8 +// CHECK11-NEXT: store i8 [[STOREDV11]], ptr [[DOTCAPTURE_EXPR__CASTED10]], align 1 // CHECK11-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED10]], align 8 // CHECK11-NEXT: [[TMP26:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK11-NEXT: [[TOBOOL12:%.*]] = trunc i8 [[TMP26]] to i1 -// CHECK11-NEXT: br i1 [[TOBOOL12]], label [[OMP_IF_THEN13:%.*]], label [[OMP_IF_ELSE14:%.*]] +// CHECK11-NEXT: [[LOADEDV12:%.*]] = trunc i8 [[TMP26]] to i1 +// CHECK11-NEXT: br i1 [[LOADEDV12]], label [[OMP_IF_THEN13:%.*]], label [[OMP_IF_ELSE14:%.*]] // CHECK11: omp_if.then13: // CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined.1, i64 [[TMP21]], i64 [[TMP23]], i64 [[TMP25]]) // CHECK11-NEXT: br label [[OMP_IF_END16:%.*]] @@ -6767,8 +6767,8 @@ int main() { // CHECK11-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK11-NEXT: [[TMP2:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK11-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1 -// CHECK11-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK11-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP2]] to i1 +// CHECK11-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK11: omp_if.then: // CHECK11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 @@ -6894,8 +6894,8 @@ int main() { // CHECK11-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK11-NEXT: [[TMP2:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK11-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1 -// CHECK11-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK11-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP2]] to i1 +// CHECK11-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK11: omp_if.then: // CHECK11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 @@ -7001,8 +7001,8 @@ int main() { // CHECK11-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK11-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK11-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 -// CHECK11-NEXT: [[_TMP4:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK11-NEXT: [[_TMP3:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[KERNEL_ARGS4:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK11-NEXT: store i32 [[ARG]], ptr [[ARG_ADDR]], align 4 // CHECK11-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK11-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -7040,12 +7040,12 @@ int main() { // CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l66() #[[ATTR2]] // CHECK11-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK11-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP15]], 0 -// CHECK11-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK11-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK11-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK11-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK11-NEXT: [[TMP16:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK11-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP16]] to i1 -// CHECK11-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK11-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK11-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP16]] to i1 +// CHECK11-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK11-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK11-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK11-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK11-NEXT: store i64 [[TMP17]], ptr [[TMP18]], align 8 @@ -7056,42 +7056,42 @@ int main() { // CHECK11-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK11-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK11-NEXT: [[TMP23:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK11-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP23]] to i1 -// CHECK11-NEXT: [[TMP24:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1 +// CHECK11-NEXT: [[LOADEDV2:%.*]] = trunc i8 [[TMP23]] to i1 +// CHECK11-NEXT: [[TMP24:%.*]] = select i1 [[LOADEDV2]], i32 0, i32 1 // CHECK11-NEXT: [[TMP25:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0 -// CHECK11-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0 // CHECK11-NEXT: store i32 3, ptr [[TMP26]], align 4 -// CHECK11-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 +// CHECK11-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1 // CHECK11-NEXT: store i32 1, ptr [[TMP27]], align 4 -// CHECK11-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 +// CHECK11-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2 // CHECK11-NEXT: store ptr [[TMP21]], ptr [[TMP28]], align 8 -// CHECK11-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 +// CHECK11-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 3 // CHECK11-NEXT: store ptr [[TMP22]], ptr [[TMP29]], align 8 -// CHECK11-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 +// CHECK11-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 4 // CHECK11-NEXT: store ptr @.offload_sizes.4, ptr [[TMP30]], align 8 -// CHECK11-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 +// CHECK11-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 5 // CHECK11-NEXT: store ptr @.offload_maptypes.5, ptr [[TMP31]], align 8 -// CHECK11-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 +// CHECK11-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 6 // CHECK11-NEXT: store ptr null, ptr [[TMP32]], align 8 -// CHECK11-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 +// CHECK11-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 7 // CHECK11-NEXT: store ptr null, ptr [[TMP33]], align 8 -// CHECK11-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 +// CHECK11-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 8 // CHECK11-NEXT: store i64 100, ptr [[TMP34]], align 8 -// CHECK11-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 +// CHECK11-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 9 // CHECK11-NEXT: store i64 0, ptr [[TMP35]], align 8 -// CHECK11-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 +// CHECK11-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 10 // CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP36]], align 4 -// CHECK11-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 +// CHECK11-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 11 // CHECK11-NEXT: store [3 x i32] [[TMP25]], ptr [[TMP37]], align 4 -// CHECK11-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 +// CHECK11-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 12 // CHECK11-NEXT: store i32 0, ptr [[TMP38]], align 4 -// CHECK11-NEXT: [[TMP39:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP24]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70.region_id, ptr [[KERNEL_ARGS5]]) +// CHECK11-NEXT: [[TMP39:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP24]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70.region_id, ptr [[KERNEL_ARGS4]]) // CHECK11-NEXT: [[TMP40:%.*]] = icmp ne i32 [[TMP39]], 0 -// CHECK11-NEXT: br i1 [[TMP40]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] -// CHECK11: omp_offload.failed6: +// CHECK11-NEXT: br i1 [[TMP40]], label [[OMP_OFFLOAD_FAILED5:%.*]], label [[OMP_OFFLOAD_CONT6:%.*]] +// CHECK11: omp_offload.failed5: // CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70(i64 [[TMP17]]) #[[ATTR2]] -// CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT7]] -// CHECK11: omp_offload.cont7: +// CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT6]] +// CHECK11: omp_offload.cont6: // CHECK11-NEXT: ret i32 0 // // @@ -7411,9 +7411,9 @@ int main() { // CHECK11-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 // CHECK11-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 // CHECK11-NEXT: [[TMP0:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK11-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK11-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK11-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK11-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK11-NEXT: [[STOREDV:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK11-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK11-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70.omp_outlined, i64 [[TMP1]]) // CHECK11-NEXT: ret void @@ -7468,8 +7468,8 @@ int main() { // CHECK11-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP54]] // CHECK11-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 // CHECK11-NEXT: [[TMP11:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP54]] -// CHECK11-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP11]] to i1 -// CHECK11-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK11-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP11]] to i1 +// CHECK11-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK11: omp_if.then: // CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l70.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]]), !llvm.access.group [[ACC_GRP54]] // CHECK11-NEXT: br label [[OMP_IF_END:%.*]] @@ -7723,8 +7723,8 @@ int main() { // CHECK13-NEXT: store i32 100, ptr [[I6]], align 4 // CHECK13-NEXT: [[TMP10:%.*]] = load i32, ptr @Arg, align 4 // CHECK13-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP10]], 0 -// CHECK13-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK13-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK13-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK13-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK13-NEXT: store i32 0, ptr [[DOTOMP_LB17]], align 4 // CHECK13-NEXT: store i32 99, ptr [[DOTOMP_UB18]], align 4 // CHECK13-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB17]], align 4 @@ -7831,8 +7831,8 @@ int main() { // CHECK13-NEXT: store i32 100, ptr [[I6]], align 4 // CHECK13-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK13-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP10]], 0 -// CHECK13-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK13-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK13-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK13-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK13-NEXT: store i32 0, ptr [[DOTOMP_LB17]], align 4 // CHECK13-NEXT: store i32 99, ptr [[DOTOMP_UB18]], align 4 // CHECK13-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB17]], align 4 @@ -8005,60 +8005,60 @@ int main() { // CHECK15-NEXT: store i32 100, ptr [[I6]], align 4 // CHECK15-NEXT: [[TMP10:%.*]] = load i32, ptr @Arg, align 4 // CHECK15-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP10]], 0 -// CHECK15-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK15-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK15-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK15-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK15-NEXT: store i32 0, ptr [[DOTOMP_LB17]], align 4 // CHECK15-NEXT: store i32 99, ptr [[DOTOMP_UB18]], align 4 // CHECK15-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB17]], align 4 // CHECK15-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV19]], align 4 // CHECK15-NEXT: [[TMP12:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK15-NEXT: [[TOBOOL21:%.*]] = trunc i8 [[TMP12]] to i1 -// CHECK15-NEXT: br i1 [[TOBOOL21]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK15-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP12]] to i1 +// CHECK15-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK15: omp_if.then: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND22:%.*]] -// CHECK15: omp.inner.for.cond22: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND21:%.*]] +// CHECK15: omp.inner.for.cond21: // CHECK15-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP15:![0-9]+]] // CHECK15-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB18]], align 4, !llvm.access.group [[ACC_GRP15]] -// CHECK15-NEXT: [[CMP23:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] -// CHECK15-NEXT: br i1 [[CMP23]], label [[OMP_INNER_FOR_BODY24:%.*]], label [[OMP_INNER_FOR_END30:%.*]] -// CHECK15: omp.inner.for.body24: +// CHECK15-NEXT: [[CMP22:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] +// CHECK15-NEXT: br i1 [[CMP22]], label [[OMP_INNER_FOR_BODY23:%.*]], label [[OMP_INNER_FOR_END29:%.*]] +// CHECK15: omp.inner.for.body23: // CHECK15-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP15]] -// CHECK15-NEXT: [[MUL25:%.*]] = mul nsw i32 [[TMP15]], 1 -// CHECK15-NEXT: [[ADD26:%.*]] = add nsw i32 0, [[MUL25]] -// CHECK15-NEXT: store i32 [[ADD26]], ptr [[I20]], align 4, !llvm.access.group [[ACC_GRP15]] +// CHECK15-NEXT: [[MUL24:%.*]] = mul nsw i32 [[TMP15]], 1 +// CHECK15-NEXT: [[ADD25:%.*]] = add nsw i32 0, [[MUL24]] +// CHECK15-NEXT: store i32 [[ADD25]], ptr [[I20]], align 4, !llvm.access.group [[ACC_GRP15]] // CHECK15-NEXT: call void @_Z3fn6v(), !llvm.access.group [[ACC_GRP15]] -// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE27:%.*]] -// CHECK15: omp.body.continue27: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC28:%.*]] -// CHECK15: omp.inner.for.inc28: +// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE26:%.*]] +// CHECK15: omp.body.continue26: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC27:%.*]] +// CHECK15: omp.inner.for.inc27: // CHECK15-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP15]] -// CHECK15-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP16]], 1 -// CHECK15-NEXT: store i32 [[ADD29]], ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP15]] -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND22]], !llvm.loop [[LOOP16:![0-9]+]] -// CHECK15: omp.inner.for.end30: +// CHECK15-NEXT: [[ADD28:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK15-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP15]] +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND21]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK15: omp.inner.for.end29: // CHECK15-NEXT: br label [[OMP_IF_END:%.*]] // CHECK15: omp_if.else: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND31:%.*]] -// CHECK15: omp.inner.for.cond31: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND30:%.*]] +// CHECK15: omp.inner.for.cond30: // CHECK15-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4 // CHECK15-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_UB18]], align 4 -// CHECK15-NEXT: [[CMP32:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]] -// CHECK15-NEXT: br i1 [[CMP32]], label [[OMP_INNER_FOR_BODY33:%.*]], label [[OMP_INNER_FOR_END39:%.*]] -// CHECK15: omp.inner.for.body33: +// CHECK15-NEXT: [[CMP31:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]] +// CHECK15-NEXT: br i1 [[CMP31]], label [[OMP_INNER_FOR_BODY32:%.*]], label [[OMP_INNER_FOR_END38:%.*]] +// CHECK15: omp.inner.for.body32: // CHECK15-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4 -// CHECK15-NEXT: [[MUL34:%.*]] = mul nsw i32 [[TMP19]], 1 -// CHECK15-NEXT: [[ADD35:%.*]] = add nsw i32 0, [[MUL34]] -// CHECK15-NEXT: store i32 [[ADD35]], ptr [[I20]], align 4 +// CHECK15-NEXT: [[MUL33:%.*]] = mul nsw i32 [[TMP19]], 1 +// CHECK15-NEXT: [[ADD34:%.*]] = add nsw i32 0, [[MUL33]] +// CHECK15-NEXT: store i32 [[ADD34]], ptr [[I20]], align 4 // CHECK15-NEXT: call void @_Z3fn6v() -// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE36:%.*]] -// CHECK15: omp.body.continue36: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC37:%.*]] -// CHECK15: omp.inner.for.inc37: +// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE35:%.*]] +// CHECK15: omp.body.continue35: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC36:%.*]] +// CHECK15: omp.inner.for.inc36: // CHECK15-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4 -// CHECK15-NEXT: [[ADD38:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK15-NEXT: store i32 [[ADD38]], ptr [[DOTOMP_IV19]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND31]], !llvm.loop [[LOOP18:![0-9]+]] -// CHECK15: omp.inner.for.end39: +// CHECK15-NEXT: [[ADD37:%.*]] = add nsw i32 [[TMP20]], 1 +// CHECK15-NEXT: store i32 [[ADD37]], ptr [[DOTOMP_IV19]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK15: omp.inner.for.end38: // CHECK15-NEXT: br label [[OMP_IF_END]] // CHECK15: omp_if.end: // CHECK15-NEXT: store i32 100, ptr [[I20]], align 4 @@ -8142,8 +8142,8 @@ int main() { // CHECK15-NEXT: store i32 100, ptr [[I6]], align 4 // CHECK15-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK15-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP10]], 0 -// CHECK15-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK15-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK15-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK15-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK15-NEXT: store i32 0, ptr [[DOTOMP_LB17]], align 4 // CHECK15-NEXT: store i32 99, ptr [[DOTOMP_UB18]], align 4 // CHECK15-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB17]], align 4 diff --git a/clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp index e1a6aad65b796d..ccc877723ef9d7 100644 --- a/clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp +++ b/clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp @@ -363,8 +363,8 @@ int main() { // CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 -// CHECK1-NEXT: [[_TMP5:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK1-NEXT: [[_TMP4:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -402,16 +402,16 @@ int main() { // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83() #[[ATTR2]] // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr @Arg, align 4 // CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP15]], 0 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP16:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP16]] to i1 -// CHECK1-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP16]] to i1 +// CHECK1-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP18]] to i1 -// CHECK1-NEXT: br i1 [[TOBOOL3]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK1-NEXT: [[LOADEDV2:%.*]] = trunc i8 [[TMP18]] to i1 +// CHECK1-NEXT: br i1 [[LOADEDV2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK1: omp_if.then: // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: store i64 [[TMP17]], ptr [[TMP19]], align 8 @@ -422,42 +422,42 @@ int main() { // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK1-NEXT: [[TMP24:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL4:%.*]] = trunc i8 [[TMP24]] to i1 -// CHECK1-NEXT: [[TMP25:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1 +// CHECK1-NEXT: [[LOADEDV3:%.*]] = trunc i8 [[TMP24]] to i1 +// CHECK1-NEXT: [[TMP25:%.*]] = select i1 [[LOADEDV3]], i32 0, i32 1 // CHECK1-NEXT: [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP25]], 0 -// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP27]], align 4 -// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 // CHECK1-NEXT: store i32 1, ptr [[TMP28]], align 4 -// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 // CHECK1-NEXT: store ptr [[TMP22]], ptr [[TMP29]], align 8 -// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 // CHECK1-NEXT: store ptr [[TMP23]], ptr [[TMP30]], align 8 -// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4 +// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 // CHECK1-NEXT: store ptr @.offload_sizes, ptr [[TMP31]], align 8 -// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 // CHECK1-NEXT: store ptr @.offload_maptypes, ptr [[TMP32]], align 8 -// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6 +// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 // CHECK1-NEXT: store ptr null, ptr [[TMP33]], align 8 -// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7 +// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 // CHECK1-NEXT: store ptr null, ptr [[TMP34]], align 8 -// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8 +// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 // CHECK1-NEXT: store i64 100, ptr [[TMP35]], align 8 -// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9 +// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 // CHECK1-NEXT: store i64 0, ptr [[TMP36]], align 8 -// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10 +// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 // CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP37]], align 4 -// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11 +// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 // CHECK1-NEXT: store [3 x i32] [[TMP26]], ptr [[TMP38]], align 4 -// CHECK1-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12 +// CHECK1-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 // CHECK1-NEXT: store i32 0, ptr [[TMP39]], align 4 -// CHECK1-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP25]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90.region_id, ptr [[KERNEL_ARGS6]]) +// CHECK1-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP25]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90.region_id, ptr [[KERNEL_ARGS5]]) // CHECK1-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0 -// CHECK1-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]] -// CHECK1: omp_offload.failed7: +// CHECK1-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CHECK1: omp_offload.failed6: // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90(i64 [[TMP17]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT8]] -// CHECK1: omp_offload.cont8: +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CHECK1: omp_offload.cont7: // CHECK1-NEXT: br label [[OMP_IF_END:%.*]] // CHECK1: omp_if.else: // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90(i64 [[TMP17]]) #[[ATTR2]] @@ -611,9 +611,9 @@ int main() { // CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 // CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90.omp_outlined, i64 [[TMP1]]) // CHECK1-NEXT: ret void @@ -693,8 +693,8 @@ int main() { // CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 -// CHECK1-NEXT: [[_TMP4:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK1-NEXT: [[_TMP3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[KERNEL_ARGS4:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK1-NEXT: store i32 [[ARG]], ptr [[ARG_ADDR]], align 4 // CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -732,16 +732,16 @@ int main() { // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l64() #[[ATTR2]] // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP15]], 0 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP16:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP16]] to i1 -// CHECK1-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP16]] to i1 +// CHECK1-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP18]] to i1 -// CHECK1-NEXT: br i1 [[TOBOOL3]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK1-NEXT: [[LOADEDV2:%.*]] = trunc i8 [[TMP18]] to i1 +// CHECK1-NEXT: br i1 [[LOADEDV2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK1: omp_if.then: // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: store i64 [[TMP17]], ptr [[TMP19]], align 8 @@ -751,39 +751,39 @@ int main() { // CHECK1-NEXT: store ptr null, ptr [[TMP21]], align 8 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP24]], align 4 -// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1 // CHECK1-NEXT: store i32 1, ptr [[TMP25]], align 4 -// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2 // CHECK1-NEXT: store ptr [[TMP22]], ptr [[TMP26]], align 8 -// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 3 // CHECK1-NEXT: store ptr [[TMP23]], ptr [[TMP27]], align 8 -// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 4 // CHECK1-NEXT: store ptr @.offload_sizes.1, ptr [[TMP28]], align 8 -// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 +// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 5 // CHECK1-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP29]], align 8 -// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 6 // CHECK1-NEXT: store ptr null, ptr [[TMP30]], align 8 -// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 +// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 7 // CHECK1-NEXT: store ptr null, ptr [[TMP31]], align 8 -// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 8 // CHECK1-NEXT: store i64 100, ptr [[TMP32]], align 8 -// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 +// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 9 // CHECK1-NEXT: store i64 0, ptr [[TMP33]], align 8 -// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 +// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 10 // CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP34]], align 4 -// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 +// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 11 // CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP35]], align 4 -// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 +// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 12 // CHECK1-NEXT: store i32 0, ptr [[TMP36]], align 4 -// CHECK1-NEXT: [[TMP37:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l68.region_id, ptr [[KERNEL_ARGS5]]) +// CHECK1-NEXT: [[TMP37:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l68.region_id, ptr [[KERNEL_ARGS4]]) // CHECK1-NEXT: [[TMP38:%.*]] = icmp ne i32 [[TMP37]], 0 -// CHECK1-NEXT: br i1 [[TMP38]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] -// CHECK1: omp_offload.failed6: +// CHECK1-NEXT: br i1 [[TMP38]], label [[OMP_OFFLOAD_FAILED5:%.*]], label [[OMP_OFFLOAD_CONT6:%.*]] +// CHECK1: omp_offload.failed5: // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l68(i64 [[TMP17]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT7]] -// CHECK1: omp_offload.cont7: +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT6]] +// CHECK1: omp_offload.cont6: // CHECK1-NEXT: br label [[OMP_IF_END:%.*]] // CHECK1: omp_if.else: // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l68(i64 [[TMP17]]) #[[ATTR2]] @@ -935,9 +935,9 @@ int main() { // CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 // CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l68.omp_outlined, i64 [[TMP1]]) // CHECK1-NEXT: ret void diff --git a/clang/test/OpenMP/task_member_call_codegen.cpp b/clang/test/OpenMP/task_member_call_codegen.cpp index b7e0b41b291ec2..c2ab3317ea9bd6 100644 --- a/clang/test/OpenMP/task_member_call_codegen.cpp +++ b/clang/test/OpenMP/task_member_call_codegen.cpp @@ -32,9 +32,8 @@ void c() { // CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) // CHECK1-NEXT: [[TMP1:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 48, i64 1, ptr @.omp_task_entry.) // CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP1]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP3]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]) +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]) // CHECK1-NEXT: ret void // // @@ -46,9 +45,8 @@ void c() { // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 // CHECK1-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP2]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 -// CHECK1-NEXT: store ptr [[TMP3]], ptr [[TMP4]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP3]], align 8 // CHECK1-NEXT: ret void // // @@ -72,7 +70,7 @@ void c() { // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0 // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP3]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 40 // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) @@ -100,8 +98,7 @@ void c() { // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]]) // CHECK3-NEXT: [[TMP0:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 1, i64 48, i64 1, ptr @.omp_task_entry.) // CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP0]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP2]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 40 // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) // CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM1]], ptr [[TMP0]]) // CHECK3-NEXT: ret void @@ -115,9 +112,8 @@ void c() { // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 // CHECK3-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 // CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8 -// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP2]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 -// CHECK3-NEXT: store ptr [[TMP3]], ptr [[TMP4]], align 8 +// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP3]], align 8 // CHECK3-NEXT: ret void // // @@ -141,7 +137,7 @@ void c() { // CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0 // CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP3]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 40 // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_if_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_if_codegen.cpp index aed27c47fa1d36..2a8621fac25dd0 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_if_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_if_codegen.cpp @@ -482,8 +482,8 @@ int main() { // CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1 -// CHECK1-NEXT: [[_TMP6:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK1-NEXT: [[_TMP5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -564,45 +564,45 @@ int main() { // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr @Arg, align 4 // CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP38:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL5:%.*]] = trunc i8 [[TMP38]] to i1 -// CHECK1-NEXT: [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP38]] to i1 +// CHECK1-NEXT: [[TMP39:%.*]] = select i1 [[LOADEDV]], i32 0, i32 1 // CHECK1-NEXT: [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0 -// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP41]], align 4 -// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1 // CHECK1-NEXT: store i32 1, ptr [[TMP42]], align 4 -// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2 // CHECK1-NEXT: store ptr [[TMP35]], ptr [[TMP43]], align 8 -// CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 3 +// CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3 // CHECK1-NEXT: store ptr [[TMP36]], ptr [[TMP44]], align 8 -// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 4 +// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4 // CHECK1-NEXT: store ptr @.offload_sizes, ptr [[TMP45]], align 8 -// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 5 +// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5 // CHECK1-NEXT: store ptr @.offload_maptypes, ptr [[TMP46]], align 8 -// CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 6 +// CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6 // CHECK1-NEXT: store ptr null, ptr [[TMP47]], align 8 -// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 7 +// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7 // CHECK1-NEXT: store ptr null, ptr [[TMP48]], align 8 -// CHECK1-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 8 +// CHECK1-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8 // CHECK1-NEXT: store i64 100, ptr [[TMP49]], align 8 -// CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 9 +// CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9 // CHECK1-NEXT: store i64 0, ptr [[TMP50]], align 8 -// CHECK1-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 10 +// CHECK1-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10 // CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP51]], align 4 -// CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 11 +// CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11 // CHECK1-NEXT: store [3 x i32] [[TMP40]], ptr [[TMP52]], align 4 -// CHECK1-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 12 +// CHECK1-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12 // CHECK1-NEXT: store i32 0, ptr [[TMP53]], align 4 -// CHECK1-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l97.region_id, ptr [[KERNEL_ARGS7]]) +// CHECK1-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l97.region_id, ptr [[KERNEL_ARGS6]]) // CHECK1-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 -// CHECK1-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] -// CHECK1: omp_offload.failed8: +// CHECK1-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]] +// CHECK1: omp_offload.failed7: // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l97(i64 [[TMP31]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT9]] -// CHECK1: omp_offload.cont9: +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT8]] +// CHECK1: omp_offload.cont8: // CHECK1-NEXT: [[TMP56:%.*]] = load i32, ptr @Arg, align 4 // CHECK1-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiEiT_(i32 noundef [[TMP56]]) // CHECK1-NEXT: ret i32 [[CALL]] @@ -898,12 +898,12 @@ int main() { // CHECK1-NEXT: store i64 [[ARG]], ptr [[ARG_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP1]] to i1 -// CHECK1-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP1]] to i1 +// CHECK1-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l97.omp_outlined, i64 [[TMP2]]) // CHECK1-NEXT: ret void @@ -958,8 +958,8 @@ int main() { // CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP11]] to i1 -// CHECK1-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP11]] to i1 +// CHECK1-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK1: omp_if.then: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l97.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]]) // CHECK1-NEXT: br label [[OMP_IF_END:%.*]] @@ -1069,8 +1069,8 @@ int main() { // CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1 -// CHECK1-NEXT: [[_TMP6:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK1-NEXT: [[_TMP5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK1-NEXT: store i32 [[ARG]], ptr [[ARG_ADDR]], align 4 // CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -1151,45 +1151,45 @@ int main() { // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP38:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL5:%.*]] = trunc i8 [[TMP38]] to i1 -// CHECK1-NEXT: [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP38]] to i1 +// CHECK1-NEXT: [[TMP39:%.*]] = select i1 [[LOADEDV]], i32 0, i32 1 // CHECK1-NEXT: [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0 -// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP41]], align 4 -// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1 // CHECK1-NEXT: store i32 1, ptr [[TMP42]], align 4 -// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2 // CHECK1-NEXT: store ptr [[TMP35]], ptr [[TMP43]], align 8 -// CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 3 +// CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3 // CHECK1-NEXT: store ptr [[TMP36]], ptr [[TMP44]], align 8 -// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 4 +// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4 // CHECK1-NEXT: store ptr @.offload_sizes.1, ptr [[TMP45]], align 8 -// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 5 +// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5 // CHECK1-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP46]], align 8 -// CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 6 +// CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6 // CHECK1-NEXT: store ptr null, ptr [[TMP47]], align 8 -// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 7 +// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7 // CHECK1-NEXT: store ptr null, ptr [[TMP48]], align 8 -// CHECK1-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 8 +// CHECK1-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8 // CHECK1-NEXT: store i64 100, ptr [[TMP49]], align 8 -// CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 9 +// CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9 // CHECK1-NEXT: store i64 0, ptr [[TMP50]], align 8 -// CHECK1-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 10 +// CHECK1-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10 // CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP51]], align 4 -// CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 11 +// CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11 // CHECK1-NEXT: store [3 x i32] [[TMP40]], ptr [[TMP52]], align 4 -// CHECK1-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 12 +// CHECK1-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12 // CHECK1-NEXT: store i32 0, ptr [[TMP53]], align 4 -// CHECK1-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l72.region_id, ptr [[KERNEL_ARGS7]]) +// CHECK1-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l72.region_id, ptr [[KERNEL_ARGS6]]) // CHECK1-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 -// CHECK1-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] -// CHECK1: omp_offload.failed8: +// CHECK1-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]] +// CHECK1: omp_offload.failed7: // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l72(i64 [[TMP31]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT9]] -// CHECK1: omp_offload.cont9: +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT8]] +// CHECK1: omp_offload.cont8: // CHECK1-NEXT: ret i32 0 // // @@ -1483,12 +1483,12 @@ int main() { // CHECK1-NEXT: store i64 [[ARG]], ptr [[ARG_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP1]] to i1 -// CHECK1-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP1]] to i1 +// CHECK1-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l72.omp_outlined, i64 [[TMP2]]) // CHECK1-NEXT: ret void @@ -1543,8 +1543,8 @@ int main() { // CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP11]] to i1 -// CHECK1-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP11]] to i1 +// CHECK1-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK1: omp_if.then: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l72.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]]) // CHECK1-NEXT: br label [[OMP_IF_END:%.*]] diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_if_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_if_codegen.cpp index 58c1f4155abfbb..c796b5e5948c82 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_if_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_if_codegen.cpp @@ -507,8 +507,8 @@ int main() { // CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1 -// CHECK1-NEXT: [[_TMP6:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK1-NEXT: [[_TMP5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -589,45 +589,45 @@ int main() { // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr @Arg, align 4 // CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP38:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL5:%.*]] = trunc i8 [[TMP38]] to i1 -// CHECK1-NEXT: [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP38]] to i1 +// CHECK1-NEXT: [[TMP39:%.*]] = select i1 [[LOADEDV]], i32 0, i32 1 // CHECK1-NEXT: [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0 -// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP41]], align 4 -// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1 // CHECK1-NEXT: store i32 1, ptr [[TMP42]], align 4 -// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2 // CHECK1-NEXT: store ptr [[TMP35]], ptr [[TMP43]], align 8 -// CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 3 +// CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3 // CHECK1-NEXT: store ptr [[TMP36]], ptr [[TMP44]], align 8 -// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 4 +// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4 // CHECK1-NEXT: store ptr @.offload_sizes, ptr [[TMP45]], align 8 -// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 5 +// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5 // CHECK1-NEXT: store ptr @.offload_maptypes, ptr [[TMP46]], align 8 -// CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 6 +// CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6 // CHECK1-NEXT: store ptr null, ptr [[TMP47]], align 8 -// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 7 +// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7 // CHECK1-NEXT: store ptr null, ptr [[TMP48]], align 8 -// CHECK1-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 8 +// CHECK1-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8 // CHECK1-NEXT: store i64 100, ptr [[TMP49]], align 8 -// CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 9 +// CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9 // CHECK1-NEXT: store i64 0, ptr [[TMP50]], align 8 -// CHECK1-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 10 +// CHECK1-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10 // CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP51]], align 4 -// CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 11 +// CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11 // CHECK1-NEXT: store [3 x i32] [[TMP40]], ptr [[TMP52]], align 4 -// CHECK1-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 12 +// CHECK1-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12 // CHECK1-NEXT: store i32 0, ptr [[TMP53]], align 4 -// CHECK1-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.region_id, ptr [[KERNEL_ARGS7]]) +// CHECK1-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.region_id, ptr [[KERNEL_ARGS6]]) // CHECK1-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 -// CHECK1-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] -// CHECK1: omp_offload.failed8: +// CHECK1-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]] +// CHECK1: omp_offload.failed7: // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP31]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT9]] -// CHECK1: omp_offload.cont9: +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT8]] +// CHECK1: omp_offload.cont8: // CHECK1-NEXT: [[TMP56:%.*]] = load i32, ptr @Arg, align 4 // CHECK1-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiEiT_(i32 noundef [[TMP56]]) // CHECK1-NEXT: ret i32 [[CALL]] @@ -951,12 +951,12 @@ int main() { // CHECK1-NEXT: store i64 [[ARG]], ptr [[ARG_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP1]] to i1 -// CHECK1-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP1]] to i1 +// CHECK1-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined, i64 [[TMP2]]) // CHECK1-NEXT: ret void @@ -1011,8 +1011,8 @@ int main() { // CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP38]] // CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP38]] -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP11]] to i1 -// CHECK1-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP11]] to i1 +// CHECK1-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK1: omp_if.then: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]]), !llvm.access.group [[ACC_GRP38]] // CHECK1-NEXT: br label [[OMP_IF_END:%.*]] @@ -1136,8 +1136,8 @@ int main() { // CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 // CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1 -// CHECK1-NEXT: [[_TMP6:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK1-NEXT: [[_TMP5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK1-NEXT: store i32 [[ARG]], ptr [[ARG_ADDR]], align 4 // CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -1218,45 +1218,45 @@ int main() { // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP38:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL5:%.*]] = trunc i8 [[TMP38]] to i1 -// CHECK1-NEXT: [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP38]] to i1 +// CHECK1-NEXT: [[TMP39:%.*]] = select i1 [[LOADEDV]], i32 0, i32 1 // CHECK1-NEXT: [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0 -// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0 // CHECK1-NEXT: store i32 3, ptr [[TMP41]], align 4 -// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1 // CHECK1-NEXT: store i32 1, ptr [[TMP42]], align 4 -// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2 // CHECK1-NEXT: store ptr [[TMP35]], ptr [[TMP43]], align 8 -// CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 3 +// CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3 // CHECK1-NEXT: store ptr [[TMP36]], ptr [[TMP44]], align 8 -// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 4 +// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4 // CHECK1-NEXT: store ptr @.offload_sizes.1, ptr [[TMP45]], align 8 -// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 5 +// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5 // CHECK1-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP46]], align 8 -// CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 6 +// CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6 // CHECK1-NEXT: store ptr null, ptr [[TMP47]], align 8 -// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 7 +// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7 // CHECK1-NEXT: store ptr null, ptr [[TMP48]], align 8 -// CHECK1-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 8 +// CHECK1-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8 // CHECK1-NEXT: store i64 100, ptr [[TMP49]], align 8 -// CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 9 +// CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9 // CHECK1-NEXT: store i64 0, ptr [[TMP50]], align 8 -// CHECK1-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 10 +// CHECK1-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10 // CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP51]], align 4 -// CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 11 +// CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11 // CHECK1-NEXT: store [3 x i32] [[TMP40]], ptr [[TMP52]], align 4 -// CHECK1-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 12 +// CHECK1-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12 // CHECK1-NEXT: store i32 0, ptr [[TMP53]], align 4 -// CHECK1-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.region_id, ptr [[KERNEL_ARGS7]]) +// CHECK1-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.region_id, ptr [[KERNEL_ARGS6]]) // CHECK1-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 -// CHECK1-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] -// CHECK1: omp_offload.failed8: +// CHECK1-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]] +// CHECK1: omp_offload.failed7: // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67(i64 [[TMP31]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT9]] -// CHECK1: omp_offload.cont9: +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT8]] +// CHECK1: omp_offload.cont8: // CHECK1-NEXT: ret i32 0 // // @@ -1578,12 +1578,12 @@ int main() { // CHECK1-NEXT: store i64 [[ARG]], ptr [[ARG_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 -// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK1-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK1-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP1]] to i1 -// CHECK1-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK1-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP1]] to i1 +// CHECK1-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK1-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined, i64 [[TMP2]]) // CHECK1-NEXT: ret void @@ -1638,8 +1638,8 @@ int main() { // CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP56]] // CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP56]] -// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP11]] to i1 -// CHECK1-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK1-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP11]] to i1 +// CHECK1-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK1: omp_if.then: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]]), !llvm.access.group [[ACC_GRP56]] // CHECK1-NEXT: br label [[OMP_IF_END:%.*]] @@ -2147,8 +2147,8 @@ int main() { // CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 // CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1 -// CHECK3-NEXT: [[_TMP6:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK3-NEXT: [[_TMP5:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK3-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK3-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -2229,45 +2229,45 @@ int main() { // CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK3-NEXT: [[TMP37:%.*]] = load i32, ptr @Arg, align 4 // CHECK3-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 -// CHECK3-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK3-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK3-NEXT: [[TMP38:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK3-NEXT: [[TOBOOL5:%.*]] = trunc i8 [[TMP38]] to i1 -// CHECK3-NEXT: [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1 +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP38]] to i1 +// CHECK3-NEXT: [[TMP39:%.*]] = select i1 [[LOADEDV]], i32 0, i32 1 // CHECK3-NEXT: [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0 -// CHECK3-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0 // CHECK3-NEXT: store i32 3, ptr [[TMP41]], align 4 -// CHECK3-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1 // CHECK3-NEXT: store i32 1, ptr [[TMP42]], align 4 -// CHECK3-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2 // CHECK3-NEXT: store ptr [[TMP35]], ptr [[TMP43]], align 8 -// CHECK3-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3 // CHECK3-NEXT: store ptr [[TMP36]], ptr [[TMP44]], align 8 -// CHECK3-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 4 +// CHECK3-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4 // CHECK3-NEXT: store ptr @.offload_sizes, ptr [[TMP45]], align 8 -// CHECK3-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 5 +// CHECK3-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5 // CHECK3-NEXT: store ptr @.offload_maptypes, ptr [[TMP46]], align 8 -// CHECK3-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 6 +// CHECK3-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6 // CHECK3-NEXT: store ptr null, ptr [[TMP47]], align 8 -// CHECK3-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 7 +// CHECK3-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7 // CHECK3-NEXT: store ptr null, ptr [[TMP48]], align 8 -// CHECK3-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 8 +// CHECK3-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8 // CHECK3-NEXT: store i64 100, ptr [[TMP49]], align 8 -// CHECK3-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 9 +// CHECK3-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9 // CHECK3-NEXT: store i64 0, ptr [[TMP50]], align 8 -// CHECK3-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 10 +// CHECK3-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10 // CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP51]], align 4 -// CHECK3-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 11 +// CHECK3-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11 // CHECK3-NEXT: store [3 x i32] [[TMP40]], ptr [[TMP52]], align 4 -// CHECK3-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 12 +// CHECK3-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12 // CHECK3-NEXT: store i32 0, ptr [[TMP53]], align 4 -// CHECK3-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.region_id, ptr [[KERNEL_ARGS7]]) +// CHECK3-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.region_id, ptr [[KERNEL_ARGS6]]) // CHECK3-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 -// CHECK3-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] -// CHECK3: omp_offload.failed8: +// CHECK3-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]] +// CHECK3: omp_offload.failed7: // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP31]]) #[[ATTR2]] -// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT9]] -// CHECK3: omp_offload.cont9: +// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT8]] +// CHECK3: omp_offload.cont8: // CHECK3-NEXT: [[TMP56:%.*]] = load i32, ptr @Arg, align 4 // CHECK3-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiEiT_(i32 noundef [[TMP56]]) // CHECK3-NEXT: ret i32 [[CALL]] @@ -2591,12 +2591,12 @@ int main() { // CHECK3-NEXT: store i64 [[ARG]], ptr [[ARG_ADDR]], align 8 // CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK3-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 -// CHECK3-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK3-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK3-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK3-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP1]] to i1 -// CHECK3-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP1]] to i1 +// CHECK3-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK3-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined, i64 [[TMP2]]) // CHECK3-NEXT: ret void @@ -2643,8 +2643,8 @@ int main() { // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK3-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4 // CHECK3-NEXT: [[TMP5:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK3-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP5]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE5:%.*]] +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP5]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE5:%.*]] // CHECK3: omp_if.then: // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK3: omp.inner.for.cond: @@ -2658,13 +2658,13 @@ int main() { // CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP35]] // CHECK3-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 // CHECK3-NEXT: [[TMP12:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP35]] -// CHECK3-NEXT: [[TOBOOL2:%.*]] = trunc i8 [[TMP12]] to i1 -// CHECK3-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL2]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1, !llvm.access.group [[ACC_GRP35]] +// CHECK3-NEXT: [[LOADEDV2:%.*]] = trunc i8 [[TMP12]] to i1 +// CHECK3-NEXT: [[STOREDV:%.*]] = zext i1 [[LOADEDV2]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1, !llvm.access.group [[ACC_GRP35]] // CHECK3-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8, !llvm.access.group [[ACC_GRP35]] // CHECK3-NEXT: [[TMP14:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP35]] -// CHECK3-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP14]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL3]], label [[OMP_IF_THEN4:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK3-NEXT: [[LOADEDV3:%.*]] = trunc i8 [[TMP14]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV3]], label [[OMP_IF_THEN4:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK3: omp_if.then4: // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined, i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]]), !llvm.access.group [[ACC_GRP35]] // CHECK3-NEXT: br label [[OMP_IF_END:%.*]] @@ -2698,13 +2698,13 @@ int main() { // CHECK3-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK3-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64 // CHECK3-NEXT: [[TMP24:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK3-NEXT: [[TOBOOL9:%.*]] = trunc i8 [[TMP24]] to i1 -// CHECK3-NEXT: [[FROMBOOL11:%.*]] = zext i1 [[TOBOOL9]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL11]], ptr [[DOTCAPTURE_EXPR__CASTED10]], align 1 +// CHECK3-NEXT: [[LOADEDV9:%.*]] = trunc i8 [[TMP24]] to i1 +// CHECK3-NEXT: [[STOREDV11:%.*]] = zext i1 [[LOADEDV9]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV11]], ptr [[DOTCAPTURE_EXPR__CASTED10]], align 1 // CHECK3-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED10]], align 8 // CHECK3-NEXT: [[TMP26:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK3-NEXT: [[TOBOOL12:%.*]] = trunc i8 [[TMP26]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL12]], label [[OMP_IF_THEN13:%.*]], label [[OMP_IF_ELSE14:%.*]] +// CHECK3-NEXT: [[LOADEDV12:%.*]] = trunc i8 [[TMP26]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV12]], label [[OMP_IF_THEN13:%.*]], label [[OMP_IF_ELSE14:%.*]] // CHECK3: omp_if.then13: // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined.1, i64 [[TMP21]], i64 [[TMP23]], i64 [[TMP25]]) // CHECK3-NEXT: br label [[OMP_IF_END16:%.*]] @@ -2770,8 +2770,8 @@ int main() { // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK3-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP2]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK3: omp_if.then: // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 @@ -2897,8 +2897,8 @@ int main() { // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK3-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP2]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK3: omp_if.then: // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 @@ -3006,8 +3006,8 @@ int main() { // CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 // CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1 -// CHECK3-NEXT: [[_TMP6:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK3-NEXT: [[_TMP5:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK3-NEXT: store i32 [[ARG]], ptr [[ARG_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK3-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -3088,45 +3088,45 @@ int main() { // CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK3-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK3-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 -// CHECK3-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK3-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK3-NEXT: [[TMP38:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK3-NEXT: [[TOBOOL5:%.*]] = trunc i8 [[TMP38]] to i1 -// CHECK3-NEXT: [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1 +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP38]] to i1 +// CHECK3-NEXT: [[TMP39:%.*]] = select i1 [[LOADEDV]], i32 0, i32 1 // CHECK3-NEXT: [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0 -// CHECK3-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0 // CHECK3-NEXT: store i32 3, ptr [[TMP41]], align 4 -// CHECK3-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1 // CHECK3-NEXT: store i32 1, ptr [[TMP42]], align 4 -// CHECK3-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2 // CHECK3-NEXT: store ptr [[TMP35]], ptr [[TMP43]], align 8 -// CHECK3-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3 // CHECK3-NEXT: store ptr [[TMP36]], ptr [[TMP44]], align 8 -// CHECK3-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 4 +// CHECK3-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4 // CHECK3-NEXT: store ptr @.offload_sizes.2, ptr [[TMP45]], align 8 -// CHECK3-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 5 +// CHECK3-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5 // CHECK3-NEXT: store ptr @.offload_maptypes.3, ptr [[TMP46]], align 8 -// CHECK3-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 6 +// CHECK3-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6 // CHECK3-NEXT: store ptr null, ptr [[TMP47]], align 8 -// CHECK3-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 7 +// CHECK3-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7 // CHECK3-NEXT: store ptr null, ptr [[TMP48]], align 8 -// CHECK3-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 8 +// CHECK3-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8 // CHECK3-NEXT: store i64 100, ptr [[TMP49]], align 8 -// CHECK3-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 9 +// CHECK3-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9 // CHECK3-NEXT: store i64 0, ptr [[TMP50]], align 8 -// CHECK3-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 10 +// CHECK3-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10 // CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP51]], align 4 -// CHECK3-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 11 +// CHECK3-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11 // CHECK3-NEXT: store [3 x i32] [[TMP40]], ptr [[TMP52]], align 4 -// CHECK3-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 12 +// CHECK3-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12 // CHECK3-NEXT: store i32 0, ptr [[TMP53]], align 4 -// CHECK3-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.region_id, ptr [[KERNEL_ARGS7]]) +// CHECK3-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.region_id, ptr [[KERNEL_ARGS6]]) // CHECK3-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 -// CHECK3-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] -// CHECK3: omp_offload.failed8: +// CHECK3-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]] +// CHECK3: omp_offload.failed7: // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67(i64 [[TMP31]]) #[[ATTR2]] -// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT9]] -// CHECK3: omp_offload.cont9: +// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT8]] +// CHECK3: omp_offload.cont8: // CHECK3-NEXT: ret i32 0 // // @@ -3448,12 +3448,12 @@ int main() { // CHECK3-NEXT: store i64 [[ARG]], ptr [[ARG_ADDR]], align 8 // CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK3-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 -// CHECK3-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK3-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK3-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK3-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP1]] to i1 -// CHECK3-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK3-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP1]] to i1 +// CHECK3-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK3-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK3-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined, i64 [[TMP2]]) // CHECK3-NEXT: ret void @@ -3508,8 +3508,8 @@ int main() { // CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP55]] // CHECK3-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 // CHECK3-NEXT: [[TMP11:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP55]] -// CHECK3-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP11]] to i1 -// CHECK3-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK3-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP11]] to i1 +// CHECK3-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK3: omp_if.then: // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]]), !llvm.access.group [[ACC_GRP55]] // CHECK3-NEXT: br label [[OMP_IF_END:%.*]] @@ -3762,8 +3762,8 @@ int main() { // CHECK5-NEXT: store i32 100, ptr [[I6]], align 4 // CHECK5-NEXT: [[TMP10:%.*]] = load i32, ptr @Arg, align 4 // CHECK5-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP10]], 0 -// CHECK5-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK5-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK5-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK5-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK5-NEXT: store i32 0, ptr [[DOTOMP_LB17]], align 4 // CHECK5-NEXT: store i32 99, ptr [[DOTOMP_UB18]], align 4 // CHECK5-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB17]], align 4 @@ -3870,8 +3870,8 @@ int main() { // CHECK5-NEXT: store i32 100, ptr [[I6]], align 4 // CHECK5-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK5-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP10]], 0 -// CHECK5-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK5-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK5-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK5-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK5-NEXT: store i32 0, ptr [[DOTOMP_LB17]], align 4 // CHECK5-NEXT: store i32 99, ptr [[DOTOMP_UB18]], align 4 // CHECK5-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB17]], align 4 @@ -4043,60 +4043,60 @@ int main() { // CHECK7-NEXT: store i32 100, ptr [[I6]], align 4 // CHECK7-NEXT: [[TMP10:%.*]] = load i32, ptr @Arg, align 4 // CHECK7-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP10]], 0 -// CHECK7-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK7-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK7-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK7-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK7-NEXT: store i32 0, ptr [[DOTOMP_LB17]], align 4 // CHECK7-NEXT: store i32 99, ptr [[DOTOMP_UB18]], align 4 // CHECK7-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB17]], align 4 // CHECK7-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV19]], align 4 // CHECK7-NEXT: [[TMP12:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK7-NEXT: [[TOBOOL21:%.*]] = trunc i8 [[TMP12]] to i1 -// CHECK7-NEXT: br i1 [[TOBOOL21]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK7-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP12]] to i1 +// CHECK7-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK7: omp_if.then: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND22:%.*]] -// CHECK7: omp.inner.for.cond22: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND21:%.*]] +// CHECK7: omp.inner.for.cond21: // CHECK7-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14:![0-9]+]] // CHECK7-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB18]], align 4, !llvm.access.group [[ACC_GRP14]] -// CHECK7-NEXT: [[CMP23:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] -// CHECK7-NEXT: br i1 [[CMP23]], label [[OMP_INNER_FOR_BODY24:%.*]], label [[OMP_INNER_FOR_END30:%.*]] -// CHECK7: omp.inner.for.body24: +// CHECK7-NEXT: [[CMP22:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] +// CHECK7-NEXT: br i1 [[CMP22]], label [[OMP_INNER_FOR_BODY23:%.*]], label [[OMP_INNER_FOR_END29:%.*]] +// CHECK7: omp.inner.for.body23: // CHECK7-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14]] -// CHECK7-NEXT: [[MUL25:%.*]] = mul nsw i32 [[TMP15]], 1 -// CHECK7-NEXT: [[ADD26:%.*]] = add nsw i32 0, [[MUL25]] -// CHECK7-NEXT: store i32 [[ADD26]], ptr [[I20]], align 4, !llvm.access.group [[ACC_GRP14]] +// CHECK7-NEXT: [[MUL24:%.*]] = mul nsw i32 [[TMP15]], 1 +// CHECK7-NEXT: [[ADD25:%.*]] = add nsw i32 0, [[MUL24]] +// CHECK7-NEXT: store i32 [[ADD25]], ptr [[I20]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK7-NEXT: call void @_Z3fn6v(), !llvm.access.group [[ACC_GRP14]] -// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE27:%.*]] -// CHECK7: omp.body.continue27: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC28:%.*]] -// CHECK7: omp.inner.for.inc28: +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE26:%.*]] +// CHECK7: omp.body.continue26: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC27:%.*]] +// CHECK7: omp.inner.for.inc27: // CHECK7-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14]] -// CHECK7-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP16]], 1 -// CHECK7-NEXT: store i32 [[ADD29]], ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14]] -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND22]], !llvm.loop [[LOOP15:![0-9]+]] -// CHECK7: omp.inner.for.end30: +// CHECK7-NEXT: [[ADD28:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK7-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14]] +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND21]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK7: omp.inner.for.end29: // CHECK7-NEXT: br label [[OMP_IF_END:%.*]] // CHECK7: omp_if.else: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND31:%.*]] -// CHECK7: omp.inner.for.cond31: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND30:%.*]] +// CHECK7: omp.inner.for.cond30: // CHECK7-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4 // CHECK7-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_UB18]], align 4 -// CHECK7-NEXT: [[CMP32:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]] -// CHECK7-NEXT: br i1 [[CMP32]], label [[OMP_INNER_FOR_BODY33:%.*]], label [[OMP_INNER_FOR_END39:%.*]] -// CHECK7: omp.inner.for.body33: +// CHECK7-NEXT: [[CMP31:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]] +// CHECK7-NEXT: br i1 [[CMP31]], label [[OMP_INNER_FOR_BODY32:%.*]], label [[OMP_INNER_FOR_END38:%.*]] +// CHECK7: omp.inner.for.body32: // CHECK7-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4 -// CHECK7-NEXT: [[MUL34:%.*]] = mul nsw i32 [[TMP19]], 1 -// CHECK7-NEXT: [[ADD35:%.*]] = add nsw i32 0, [[MUL34]] -// CHECK7-NEXT: store i32 [[ADD35]], ptr [[I20]], align 4 +// CHECK7-NEXT: [[MUL33:%.*]] = mul nsw i32 [[TMP19]], 1 +// CHECK7-NEXT: [[ADD34:%.*]] = add nsw i32 0, [[MUL33]] +// CHECK7-NEXT: store i32 [[ADD34]], ptr [[I20]], align 4 // CHECK7-NEXT: call void @_Z3fn6v() -// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE36:%.*]] -// CHECK7: omp.body.continue36: -// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC37:%.*]] -// CHECK7: omp.inner.for.inc37: +// CHECK7-NEXT: br label [[OMP_BODY_CONTINUE35:%.*]] +// CHECK7: omp.body.continue35: +// CHECK7-NEXT: br label [[OMP_INNER_FOR_INC36:%.*]] +// CHECK7: omp.inner.for.inc36: // CHECK7-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4 -// CHECK7-NEXT: [[ADD38:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK7-NEXT: store i32 [[ADD38]], ptr [[DOTOMP_IV19]], align 4 -// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND31]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK7: omp.inner.for.end39: +// CHECK7-NEXT: [[ADD37:%.*]] = add nsw i32 [[TMP20]], 1 +// CHECK7-NEXT: store i32 [[ADD37]], ptr [[DOTOMP_IV19]], align 4 +// CHECK7-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK7: omp.inner.for.end38: // CHECK7-NEXT: br label [[OMP_IF_END]] // CHECK7: omp_if.end: // CHECK7-NEXT: store i32 100, ptr [[I20]], align 4 @@ -4180,8 +4180,8 @@ int main() { // CHECK7-NEXT: store i32 100, ptr [[I6]], align 4 // CHECK7-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK7-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP10]], 0 -// CHECK7-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK7-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK7-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK7-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK7-NEXT: store i32 0, ptr [[DOTOMP_LB17]], align 4 // CHECK7-NEXT: store i32 99, ptr [[DOTOMP_UB18]], align 4 // CHECK7-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB17]], align 4 @@ -4608,8 +4608,8 @@ int main() { // CHECK9-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK9-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 // CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1 -// CHECK9-NEXT: [[_TMP6:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK9-NEXT: [[_TMP5:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK9-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK9-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK9-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -4690,45 +4690,45 @@ int main() { // CHECK9-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK9-NEXT: [[TMP37:%.*]] = load i32, ptr @Arg, align 4 // CHECK9-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 -// CHECK9-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK9-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK9-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK9-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK9-NEXT: [[TMP38:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK9-NEXT: [[TOBOOL5:%.*]] = trunc i8 [[TMP38]] to i1 -// CHECK9-NEXT: [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1 +// CHECK9-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP38]] to i1 +// CHECK9-NEXT: [[TMP39:%.*]] = select i1 [[LOADEDV]], i32 0, i32 1 // CHECK9-NEXT: [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0 -// CHECK9-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0 // CHECK9-NEXT: store i32 3, ptr [[TMP41]], align 4 -// CHECK9-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1 +// CHECK9-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1 // CHECK9-NEXT: store i32 1, ptr [[TMP42]], align 4 -// CHECK9-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2 +// CHECK9-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2 // CHECK9-NEXT: store ptr [[TMP35]], ptr [[TMP43]], align 8 -// CHECK9-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 3 +// CHECK9-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3 // CHECK9-NEXT: store ptr [[TMP36]], ptr [[TMP44]], align 8 -// CHECK9-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 4 +// CHECK9-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4 // CHECK9-NEXT: store ptr @.offload_sizes, ptr [[TMP45]], align 8 -// CHECK9-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 5 +// CHECK9-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5 // CHECK9-NEXT: store ptr @.offload_maptypes, ptr [[TMP46]], align 8 -// CHECK9-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 6 +// CHECK9-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6 // CHECK9-NEXT: store ptr null, ptr [[TMP47]], align 8 -// CHECK9-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 7 +// CHECK9-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7 // CHECK9-NEXT: store ptr null, ptr [[TMP48]], align 8 -// CHECK9-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 8 +// CHECK9-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8 // CHECK9-NEXT: store i64 100, ptr [[TMP49]], align 8 -// CHECK9-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 9 +// CHECK9-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9 // CHECK9-NEXT: store i64 0, ptr [[TMP50]], align 8 -// CHECK9-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 10 +// CHECK9-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10 // CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP51]], align 4 -// CHECK9-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 11 +// CHECK9-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11 // CHECK9-NEXT: store [3 x i32] [[TMP40]], ptr [[TMP52]], align 4 -// CHECK9-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 12 +// CHECK9-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12 // CHECK9-NEXT: store i32 0, ptr [[TMP53]], align 4 -// CHECK9-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.region_id, ptr [[KERNEL_ARGS7]]) +// CHECK9-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.region_id, ptr [[KERNEL_ARGS6]]) // CHECK9-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 -// CHECK9-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] -// CHECK9: omp_offload.failed8: +// CHECK9-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]] +// CHECK9: omp_offload.failed7: // CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP31]]) #[[ATTR2]] -// CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT9]] -// CHECK9: omp_offload.cont9: +// CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT8]] +// CHECK9: omp_offload.cont8: // CHECK9-NEXT: [[TMP56:%.*]] = load i32, ptr @Arg, align 4 // CHECK9-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiEiT_(i32 noundef [[TMP56]]) // CHECK9-NEXT: ret i32 [[CALL]] @@ -5052,12 +5052,12 @@ int main() { // CHECK9-NEXT: store i64 [[ARG]], ptr [[ARG_ADDR]], align 8 // CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK9-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 -// CHECK9-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK9-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK9-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK9-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK9-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK9-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP1]] to i1 -// CHECK9-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK9-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK9-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP1]] to i1 +// CHECK9-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK9-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK9-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined, i64 [[TMP2]]) // CHECK9-NEXT: ret void @@ -5112,8 +5112,8 @@ int main() { // CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP38]] // CHECK9-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 // CHECK9-NEXT: [[TMP11:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP38]] -// CHECK9-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP11]] to i1 -// CHECK9-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK9-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP11]] to i1 +// CHECK9-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK9: omp_if.then: // CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]]), !llvm.access.group [[ACC_GRP38]] // CHECK9-NEXT: br label [[OMP_IF_END:%.*]] @@ -5237,8 +5237,8 @@ int main() { // CHECK9-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK9-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 // CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1 -// CHECK9-NEXT: [[_TMP6:%.*]] = alloca i32, align 4 -// CHECK9-NEXT: [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK9-NEXT: [[_TMP5:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK9-NEXT: store i32 [[ARG]], ptr [[ARG_ADDR]], align 4 // CHECK9-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK9-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -5319,45 +5319,45 @@ int main() { // CHECK9-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK9-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK9-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 -// CHECK9-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK9-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK9-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK9-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK9-NEXT: [[TMP38:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK9-NEXT: [[TOBOOL5:%.*]] = trunc i8 [[TMP38]] to i1 -// CHECK9-NEXT: [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1 +// CHECK9-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP38]] to i1 +// CHECK9-NEXT: [[TMP39:%.*]] = select i1 [[LOADEDV]], i32 0, i32 1 // CHECK9-NEXT: [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0 -// CHECK9-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0 +// CHECK9-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0 // CHECK9-NEXT: store i32 3, ptr [[TMP41]], align 4 -// CHECK9-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1 +// CHECK9-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1 // CHECK9-NEXT: store i32 1, ptr [[TMP42]], align 4 -// CHECK9-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2 +// CHECK9-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2 // CHECK9-NEXT: store ptr [[TMP35]], ptr [[TMP43]], align 8 -// CHECK9-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 3 +// CHECK9-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3 // CHECK9-NEXT: store ptr [[TMP36]], ptr [[TMP44]], align 8 -// CHECK9-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 4 +// CHECK9-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4 // CHECK9-NEXT: store ptr @.offload_sizes.1, ptr [[TMP45]], align 8 -// CHECK9-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 5 +// CHECK9-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5 // CHECK9-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP46]], align 8 -// CHECK9-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 6 +// CHECK9-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6 // CHECK9-NEXT: store ptr null, ptr [[TMP47]], align 8 -// CHECK9-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 7 +// CHECK9-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7 // CHECK9-NEXT: store ptr null, ptr [[TMP48]], align 8 -// CHECK9-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 8 +// CHECK9-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8 // CHECK9-NEXT: store i64 100, ptr [[TMP49]], align 8 -// CHECK9-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 9 +// CHECK9-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9 // CHECK9-NEXT: store i64 0, ptr [[TMP50]], align 8 -// CHECK9-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 10 +// CHECK9-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10 // CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP51]], align 4 -// CHECK9-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 11 +// CHECK9-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11 // CHECK9-NEXT: store [3 x i32] [[TMP40]], ptr [[TMP52]], align 4 -// CHECK9-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 12 +// CHECK9-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12 // CHECK9-NEXT: store i32 0, ptr [[TMP53]], align 4 -// CHECK9-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.region_id, ptr [[KERNEL_ARGS7]]) +// CHECK9-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.region_id, ptr [[KERNEL_ARGS6]]) // CHECK9-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 -// CHECK9-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] -// CHECK9: omp_offload.failed8: +// CHECK9-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]] +// CHECK9: omp_offload.failed7: // CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67(i64 [[TMP31]]) #[[ATTR2]] -// CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT9]] -// CHECK9: omp_offload.cont9: +// CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT8]] +// CHECK9: omp_offload.cont8: // CHECK9-NEXT: ret i32 0 // // @@ -5679,12 +5679,12 @@ int main() { // CHECK9-NEXT: store i64 [[ARG]], ptr [[ARG_ADDR]], align 8 // CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK9-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 -// CHECK9-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK9-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK9-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK9-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK9-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK9-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP1]] to i1 -// CHECK9-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK9-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK9-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP1]] to i1 +// CHECK9-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK9-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK9-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined, i64 [[TMP2]]) // CHECK9-NEXT: ret void @@ -5739,8 +5739,8 @@ int main() { // CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP56]] // CHECK9-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 // CHECK9-NEXT: [[TMP11:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP56]] -// CHECK9-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP11]] to i1 -// CHECK9-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK9-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP11]] to i1 +// CHECK9-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK9: omp_if.then: // CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]]), !llvm.access.group [[ACC_GRP56]] // CHECK9-NEXT: br label [[OMP_IF_END:%.*]] @@ -6248,8 +6248,8 @@ int main() { // CHECK11-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK11-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 // CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1 -// CHECK11-NEXT: [[_TMP6:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK11-NEXT: [[_TMP5:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK11-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK11-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK11-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -6330,45 +6330,45 @@ int main() { // CHECK11-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK11-NEXT: [[TMP37:%.*]] = load i32, ptr @Arg, align 4 // CHECK11-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 -// CHECK11-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK11-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK11-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK11-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK11-NEXT: [[TMP38:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK11-NEXT: [[TOBOOL5:%.*]] = trunc i8 [[TMP38]] to i1 -// CHECK11-NEXT: [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1 +// CHECK11-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP38]] to i1 +// CHECK11-NEXT: [[TMP39:%.*]] = select i1 [[LOADEDV]], i32 0, i32 1 // CHECK11-NEXT: [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0 -// CHECK11-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0 // CHECK11-NEXT: store i32 3, ptr [[TMP41]], align 4 -// CHECK11-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1 +// CHECK11-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1 // CHECK11-NEXT: store i32 1, ptr [[TMP42]], align 4 -// CHECK11-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2 +// CHECK11-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2 // CHECK11-NEXT: store ptr [[TMP35]], ptr [[TMP43]], align 8 -// CHECK11-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 3 +// CHECK11-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3 // CHECK11-NEXT: store ptr [[TMP36]], ptr [[TMP44]], align 8 -// CHECK11-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 4 +// CHECK11-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4 // CHECK11-NEXT: store ptr @.offload_sizes, ptr [[TMP45]], align 8 -// CHECK11-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 5 +// CHECK11-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5 // CHECK11-NEXT: store ptr @.offload_maptypes, ptr [[TMP46]], align 8 -// CHECK11-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 6 +// CHECK11-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6 // CHECK11-NEXT: store ptr null, ptr [[TMP47]], align 8 -// CHECK11-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 7 +// CHECK11-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7 // CHECK11-NEXT: store ptr null, ptr [[TMP48]], align 8 -// CHECK11-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 8 +// CHECK11-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8 // CHECK11-NEXT: store i64 100, ptr [[TMP49]], align 8 -// CHECK11-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 9 +// CHECK11-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9 // CHECK11-NEXT: store i64 0, ptr [[TMP50]], align 8 -// CHECK11-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 10 +// CHECK11-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10 // CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP51]], align 4 -// CHECK11-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 11 +// CHECK11-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11 // CHECK11-NEXT: store [3 x i32] [[TMP40]], ptr [[TMP52]], align 4 -// CHECK11-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 12 +// CHECK11-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12 // CHECK11-NEXT: store i32 0, ptr [[TMP53]], align 4 -// CHECK11-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.region_id, ptr [[KERNEL_ARGS7]]) +// CHECK11-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.region_id, ptr [[KERNEL_ARGS6]]) // CHECK11-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 -// CHECK11-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] -// CHECK11: omp_offload.failed8: +// CHECK11-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]] +// CHECK11: omp_offload.failed7: // CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP31]]) #[[ATTR2]] -// CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT9]] -// CHECK11: omp_offload.cont9: +// CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT8]] +// CHECK11: omp_offload.cont8: // CHECK11-NEXT: [[TMP56:%.*]] = load i32, ptr @Arg, align 4 // CHECK11-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiEiT_(i32 noundef [[TMP56]]) // CHECK11-NEXT: ret i32 [[CALL]] @@ -6692,12 +6692,12 @@ int main() { // CHECK11-NEXT: store i64 [[ARG]], ptr [[ARG_ADDR]], align 8 // CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK11-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 -// CHECK11-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK11-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK11-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK11-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK11-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK11-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP1]] to i1 -// CHECK11-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK11-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK11-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP1]] to i1 +// CHECK11-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK11-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK11-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined, i64 [[TMP2]]) // CHECK11-NEXT: ret void @@ -6744,8 +6744,8 @@ int main() { // CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK11-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4 // CHECK11-NEXT: [[TMP5:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK11-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP5]] to i1 -// CHECK11-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE5:%.*]] +// CHECK11-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP5]] to i1 +// CHECK11-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE5:%.*]] // CHECK11: omp_if.then: // CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK11: omp.inner.for.cond: @@ -6759,13 +6759,13 @@ int main() { // CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP35]] // CHECK11-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 // CHECK11-NEXT: [[TMP12:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP35]] -// CHECK11-NEXT: [[TOBOOL2:%.*]] = trunc i8 [[TMP12]] to i1 -// CHECK11-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL2]] to i8 -// CHECK11-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1, !llvm.access.group [[ACC_GRP35]] +// CHECK11-NEXT: [[LOADEDV2:%.*]] = trunc i8 [[TMP12]] to i1 +// CHECK11-NEXT: [[STOREDV:%.*]] = zext i1 [[LOADEDV2]] to i8 +// CHECK11-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1, !llvm.access.group [[ACC_GRP35]] // CHECK11-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8, !llvm.access.group [[ACC_GRP35]] // CHECK11-NEXT: [[TMP14:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP35]] -// CHECK11-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP14]] to i1 -// CHECK11-NEXT: br i1 [[TOBOOL3]], label [[OMP_IF_THEN4:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK11-NEXT: [[LOADEDV3:%.*]] = trunc i8 [[TMP14]] to i1 +// CHECK11-NEXT: br i1 [[LOADEDV3]], label [[OMP_IF_THEN4:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK11: omp_if.then4: // CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined, i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]]), !llvm.access.group [[ACC_GRP35]] // CHECK11-NEXT: br label [[OMP_IF_END:%.*]] @@ -6799,13 +6799,13 @@ int main() { // CHECK11-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK11-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64 // CHECK11-NEXT: [[TMP24:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK11-NEXT: [[TOBOOL9:%.*]] = trunc i8 [[TMP24]] to i1 -// CHECK11-NEXT: [[FROMBOOL11:%.*]] = zext i1 [[TOBOOL9]] to i8 -// CHECK11-NEXT: store i8 [[FROMBOOL11]], ptr [[DOTCAPTURE_EXPR__CASTED10]], align 1 +// CHECK11-NEXT: [[LOADEDV9:%.*]] = trunc i8 [[TMP24]] to i1 +// CHECK11-NEXT: [[STOREDV11:%.*]] = zext i1 [[LOADEDV9]] to i8 +// CHECK11-NEXT: store i8 [[STOREDV11]], ptr [[DOTCAPTURE_EXPR__CASTED10]], align 1 // CHECK11-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED10]], align 8 // CHECK11-NEXT: [[TMP26:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK11-NEXT: [[TOBOOL12:%.*]] = trunc i8 [[TMP26]] to i1 -// CHECK11-NEXT: br i1 [[TOBOOL12]], label [[OMP_IF_THEN13:%.*]], label [[OMP_IF_ELSE14:%.*]] +// CHECK11-NEXT: [[LOADEDV12:%.*]] = trunc i8 [[TMP26]] to i1 +// CHECK11-NEXT: br i1 [[LOADEDV12]], label [[OMP_IF_THEN13:%.*]], label [[OMP_IF_ELSE14:%.*]] // CHECK11: omp_if.then13: // CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined.1, i64 [[TMP21]], i64 [[TMP23]], i64 [[TMP25]]) // CHECK11-NEXT: br label [[OMP_IF_END16:%.*]] @@ -6871,8 +6871,8 @@ int main() { // CHECK11-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK11-NEXT: [[TMP2:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK11-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1 -// CHECK11-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK11-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP2]] to i1 +// CHECK11-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK11: omp_if.then: // CHECK11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 @@ -6998,8 +6998,8 @@ int main() { // CHECK11-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK11-NEXT: [[TMP2:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK11-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1 -// CHECK11-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK11-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP2]] to i1 +// CHECK11-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK11: omp_if.then: // CHECK11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 @@ -7107,8 +7107,8 @@ int main() { // CHECK11-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 // CHECK11-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 // CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1 -// CHECK11-NEXT: [[_TMP6:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK11-NEXT: [[_TMP5:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 // CHECK11-NEXT: store i32 [[ARG]], ptr [[ARG_ADDR]], align 4 // CHECK11-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 // CHECK11-NEXT: store i32 3, ptr [[TMP0]], align 4 @@ -7189,45 +7189,45 @@ int main() { // CHECK11-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 // CHECK11-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK11-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 -// CHECK11-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK11-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK11-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK11-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK11-NEXT: [[TMP38:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK11-NEXT: [[TOBOOL5:%.*]] = trunc i8 [[TMP38]] to i1 -// CHECK11-NEXT: [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1 +// CHECK11-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP38]] to i1 +// CHECK11-NEXT: [[TMP39:%.*]] = select i1 [[LOADEDV]], i32 0, i32 1 // CHECK11-NEXT: [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0 -// CHECK11-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0 // CHECK11-NEXT: store i32 3, ptr [[TMP41]], align 4 -// CHECK11-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1 +// CHECK11-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1 // CHECK11-NEXT: store i32 1, ptr [[TMP42]], align 4 -// CHECK11-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2 +// CHECK11-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2 // CHECK11-NEXT: store ptr [[TMP35]], ptr [[TMP43]], align 8 -// CHECK11-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 3 +// CHECK11-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3 // CHECK11-NEXT: store ptr [[TMP36]], ptr [[TMP44]], align 8 -// CHECK11-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 4 +// CHECK11-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4 // CHECK11-NEXT: store ptr @.offload_sizes.2, ptr [[TMP45]], align 8 -// CHECK11-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 5 +// CHECK11-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5 // CHECK11-NEXT: store ptr @.offload_maptypes.3, ptr [[TMP46]], align 8 -// CHECK11-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 6 +// CHECK11-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6 // CHECK11-NEXT: store ptr null, ptr [[TMP47]], align 8 -// CHECK11-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 7 +// CHECK11-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7 // CHECK11-NEXT: store ptr null, ptr [[TMP48]], align 8 -// CHECK11-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 8 +// CHECK11-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8 // CHECK11-NEXT: store i64 100, ptr [[TMP49]], align 8 -// CHECK11-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 9 +// CHECK11-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9 // CHECK11-NEXT: store i64 0, ptr [[TMP50]], align 8 -// CHECK11-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 10 +// CHECK11-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10 // CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP51]], align 4 -// CHECK11-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 11 +// CHECK11-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11 // CHECK11-NEXT: store [3 x i32] [[TMP40]], ptr [[TMP52]], align 4 -// CHECK11-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 12 +// CHECK11-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12 // CHECK11-NEXT: store i32 0, ptr [[TMP53]], align 4 -// CHECK11-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.region_id, ptr [[KERNEL_ARGS7]]) +// CHECK11-NEXT: [[TMP54:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP39]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.region_id, ptr [[KERNEL_ARGS6]]) // CHECK11-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 -// CHECK11-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] -// CHECK11: omp_offload.failed8: +// CHECK11-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]] +// CHECK11: omp_offload.failed7: // CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67(i64 [[TMP31]]) #[[ATTR2]] -// CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT9]] -// CHECK11: omp_offload.cont9: +// CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT8]] +// CHECK11: omp_offload.cont8: // CHECK11-NEXT: ret i32 0 // // @@ -7549,12 +7549,12 @@ int main() { // CHECK11-NEXT: store i64 [[ARG]], ptr [[ARG_ADDR]], align 8 // CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK11-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 -// CHECK11-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK11-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK11-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK11-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK11-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK11-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP1]] to i1 -// CHECK11-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK11-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK11-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP1]] to i1 +// CHECK11-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK11-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK11-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined, i64 [[TMP2]]) // CHECK11-NEXT: ret void @@ -7609,8 +7609,8 @@ int main() { // CHECK11-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP55]] // CHECK11-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 // CHECK11-NEXT: [[TMP11:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !llvm.access.group [[ACC_GRP55]] -// CHECK11-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP11]] to i1 -// CHECK11-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK11-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP11]] to i1 +// CHECK11-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK11: omp_if.then: // CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]]), !llvm.access.group [[ACC_GRP55]] // CHECK11-NEXT: br label [[OMP_IF_END:%.*]] @@ -7863,8 +7863,8 @@ int main() { // CHECK13-NEXT: store i32 100, ptr [[I6]], align 4 // CHECK13-NEXT: [[TMP10:%.*]] = load i32, ptr @Arg, align 4 // CHECK13-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP10]], 0 -// CHECK13-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK13-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK13-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK13-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK13-NEXT: store i32 0, ptr [[DOTOMP_LB17]], align 4 // CHECK13-NEXT: store i32 99, ptr [[DOTOMP_UB18]], align 4 // CHECK13-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB17]], align 4 @@ -7971,8 +7971,8 @@ int main() { // CHECK13-NEXT: store i32 100, ptr [[I6]], align 4 // CHECK13-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK13-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP10]], 0 -// CHECK13-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK13-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK13-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK13-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK13-NEXT: store i32 0, ptr [[DOTOMP_LB17]], align 4 // CHECK13-NEXT: store i32 99, ptr [[DOTOMP_UB18]], align 4 // CHECK13-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB17]], align 4 @@ -8144,60 +8144,60 @@ int main() { // CHECK15-NEXT: store i32 100, ptr [[I6]], align 4 // CHECK15-NEXT: [[TMP10:%.*]] = load i32, ptr @Arg, align 4 // CHECK15-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP10]], 0 -// CHECK15-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK15-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK15-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK15-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK15-NEXT: store i32 0, ptr [[DOTOMP_LB17]], align 4 // CHECK15-NEXT: store i32 99, ptr [[DOTOMP_UB18]], align 4 // CHECK15-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB17]], align 4 // CHECK15-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV19]], align 4 // CHECK15-NEXT: [[TMP12:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK15-NEXT: [[TOBOOL21:%.*]] = trunc i8 [[TMP12]] to i1 -// CHECK15-NEXT: br i1 [[TOBOOL21]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK15-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP12]] to i1 +// CHECK15-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK15: omp_if.then: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND22:%.*]] -// CHECK15: omp.inner.for.cond22: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND21:%.*]] +// CHECK15: omp.inner.for.cond21: // CHECK15-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14:![0-9]+]] // CHECK15-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB18]], align 4, !llvm.access.group [[ACC_GRP14]] -// CHECK15-NEXT: [[CMP23:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] -// CHECK15-NEXT: br i1 [[CMP23]], label [[OMP_INNER_FOR_BODY24:%.*]], label [[OMP_INNER_FOR_END30:%.*]] -// CHECK15: omp.inner.for.body24: +// CHECK15-NEXT: [[CMP22:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] +// CHECK15-NEXT: br i1 [[CMP22]], label [[OMP_INNER_FOR_BODY23:%.*]], label [[OMP_INNER_FOR_END29:%.*]] +// CHECK15: omp.inner.for.body23: // CHECK15-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14]] -// CHECK15-NEXT: [[MUL25:%.*]] = mul nsw i32 [[TMP15]], 1 -// CHECK15-NEXT: [[ADD26:%.*]] = add nsw i32 0, [[MUL25]] -// CHECK15-NEXT: store i32 [[ADD26]], ptr [[I20]], align 4, !llvm.access.group [[ACC_GRP14]] +// CHECK15-NEXT: [[MUL24:%.*]] = mul nsw i32 [[TMP15]], 1 +// CHECK15-NEXT: [[ADD25:%.*]] = add nsw i32 0, [[MUL24]] +// CHECK15-NEXT: store i32 [[ADD25]], ptr [[I20]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK15-NEXT: call void @_Z3fn6v(), !llvm.access.group [[ACC_GRP14]] -// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE27:%.*]] -// CHECK15: omp.body.continue27: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC28:%.*]] -// CHECK15: omp.inner.for.inc28: +// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE26:%.*]] +// CHECK15: omp.body.continue26: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC27:%.*]] +// CHECK15: omp.inner.for.inc27: // CHECK15-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14]] -// CHECK15-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP16]], 1 -// CHECK15-NEXT: store i32 [[ADD29]], ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14]] -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND22]], !llvm.loop [[LOOP15:![0-9]+]] -// CHECK15: omp.inner.for.end30: +// CHECK15-NEXT: [[ADD28:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK15-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_IV19]], align 4, !llvm.access.group [[ACC_GRP14]] +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND21]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK15: omp.inner.for.end29: // CHECK15-NEXT: br label [[OMP_IF_END:%.*]] // CHECK15: omp_if.else: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND31:%.*]] -// CHECK15: omp.inner.for.cond31: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND30:%.*]] +// CHECK15: omp.inner.for.cond30: // CHECK15-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4 // CHECK15-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_UB18]], align 4 -// CHECK15-NEXT: [[CMP32:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]] -// CHECK15-NEXT: br i1 [[CMP32]], label [[OMP_INNER_FOR_BODY33:%.*]], label [[OMP_INNER_FOR_END39:%.*]] -// CHECK15: omp.inner.for.body33: +// CHECK15-NEXT: [[CMP31:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]] +// CHECK15-NEXT: br i1 [[CMP31]], label [[OMP_INNER_FOR_BODY32:%.*]], label [[OMP_INNER_FOR_END38:%.*]] +// CHECK15: omp.inner.for.body32: // CHECK15-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4 -// CHECK15-NEXT: [[MUL34:%.*]] = mul nsw i32 [[TMP19]], 1 -// CHECK15-NEXT: [[ADD35:%.*]] = add nsw i32 0, [[MUL34]] -// CHECK15-NEXT: store i32 [[ADD35]], ptr [[I20]], align 4 +// CHECK15-NEXT: [[MUL33:%.*]] = mul nsw i32 [[TMP19]], 1 +// CHECK15-NEXT: [[ADD34:%.*]] = add nsw i32 0, [[MUL33]] +// CHECK15-NEXT: store i32 [[ADD34]], ptr [[I20]], align 4 // CHECK15-NEXT: call void @_Z3fn6v() -// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE36:%.*]] -// CHECK15: omp.body.continue36: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC37:%.*]] -// CHECK15: omp.inner.for.inc37: +// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE35:%.*]] +// CHECK15: omp.body.continue35: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC36:%.*]] +// CHECK15: omp.inner.for.inc36: // CHECK15-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV19]], align 4 -// CHECK15-NEXT: [[ADD38:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK15-NEXT: store i32 [[ADD38]], ptr [[DOTOMP_IV19]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND31]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK15: omp.inner.for.end39: +// CHECK15-NEXT: [[ADD37:%.*]] = add nsw i32 [[TMP20]], 1 +// CHECK15-NEXT: store i32 [[ADD37]], ptr [[DOTOMP_IV19]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK15: omp.inner.for.end38: // CHECK15-NEXT: br label [[OMP_IF_END]] // CHECK15: omp_if.end: // CHECK15-NEXT: store i32 100, ptr [[I20]], align 4 @@ -8281,8 +8281,8 @@ int main() { // CHECK15-NEXT: store i32 100, ptr [[I6]], align 4 // CHECK15-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 // CHECK15-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP10]], 0 -// CHECK15-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK15-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK15-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK15-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK15-NEXT: store i32 0, ptr [[DOTOMP_LB17]], align 4 // CHECK15-NEXT: store i32 99, ptr [[DOTOMP_UB18]], align 4 // CHECK15-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB17]], align 4 diff --git a/clang/test/OpenMP/teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_codegen.cpp index 2fab0cff55373a..d8724dd21b7839 100644 --- a/clang/test/OpenMP/teams_distribute_simd_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_simd_codegen.cpp @@ -2397,12 +2397,12 @@ int main (int argc, char **argv) { // CHECK21-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[TMP0]], i32 0, i32 1 // CHECK21-NEXT: [[TMP1:%.*]] = load float, ptr [[B]], align 4 // CHECK21-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP1]], 0.000000e+00 -// CHECK21-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK21-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK21-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK21-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK21-NEXT: [[TMP2:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK21-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP2]] to i1 -// CHECK21-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK21-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK21-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP2]] to i1 +// CHECK21-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK21-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK21-NEXT: [[TMP3:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK21-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB2]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l123.omp_outlined, ptr [[TMP0]], i64 [[TMP3]]) // CHECK21-NEXT: ret void @@ -2448,8 +2448,8 @@ int main (int argc, char **argv) { // CHECK21-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 // CHECK21-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4 // CHECK21-NEXT: [[TMP6:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK21-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP6]] to i1 -// CHECK21-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK21-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// CHECK21-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK21: omp_if.then: // CHECK21-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK21: omp.inner.for.cond: @@ -2625,12 +2625,12 @@ int main (int argc, char **argv) { // CHECK23-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[TMP0]], i32 0, i32 1 // CHECK23-NEXT: [[TMP1:%.*]] = load float, ptr [[B]], align 4 // CHECK23-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP1]], 0.000000e+00 -// CHECK23-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK23-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK23-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK23-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK23-NEXT: [[TMP2:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK23-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP2]] to i1 -// CHECK23-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK23-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK23-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP2]] to i1 +// CHECK23-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK23-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK23-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4 // CHECK23-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB2]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l123.omp_outlined, ptr [[TMP0]], i32 [[TMP3]]) // CHECK23-NEXT: ret void @@ -2676,8 +2676,8 @@ int main (int argc, char **argv) { // CHECK23-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 // CHECK23-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4 // CHECK23-NEXT: [[TMP6:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK23-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP6]] to i1 -// CHECK23-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK23-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// CHECK23-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK23: omp_if.then: // CHECK23-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK23: omp.inner.for.cond: @@ -2888,15 +2888,15 @@ int main (int argc, char **argv) { // CHECK29-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[THIS1]], i32 0, i32 1 // CHECK29-NEXT: [[TMP0:%.*]] = load float, ptr [[B]], align 4 // CHECK29-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP0]], 0.000000e+00 -// CHECK29-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK29-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK29-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK29-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK29-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK29-NEXT: store i32 122, ptr [[DOTOMP_UB]], align 4 // CHECK29-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 // CHECK29-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_IV]], align 4 // CHECK29-NEXT: [[TMP2:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK29-NEXT: [[TOBOOL2:%.*]] = trunc i8 [[TMP2]] to i1 -// CHECK29-NEXT: br i1 [[TOBOOL2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK29-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP2]] to i1 +// CHECK29-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK29: omp_if.then: // CHECK29-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK29: omp.inner.for.cond: @@ -2909,8 +2909,8 @@ int main (int argc, char **argv) { // CHECK29-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP5]], 1 // CHECK29-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK29-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP2]] -// CHECK29-NEXT: [[B3:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 1 -// CHECK29-NEXT: [[TMP6:%.*]] = load float, ptr [[B3]], align 4, !nontemporal [[META3:![0-9]+]], !llvm.access.group [[ACC_GRP2]] +// CHECK29-NEXT: [[B2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 1 +// CHECK29-NEXT: [[TMP6:%.*]] = load float, ptr [[B2]], align 4, !nontemporal [[META3:![0-9]+]], !llvm.access.group [[ACC_GRP2]] // CHECK29-NEXT: [[CONV:%.*]] = fptosi float [[TMP6]] to i32 // CHECK29-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 // CHECK29-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP2]] @@ -2922,46 +2922,46 @@ int main (int argc, char **argv) { // CHECK29-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK29: omp.inner.for.inc: // CHECK29-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] -// CHECK29-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP8]], 1 -// CHECK29-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// CHECK29-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP8]], 1 +// CHECK29-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] // CHECK29-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]] // CHECK29: omp.inner.for.end: // CHECK29-NEXT: br label [[OMP_IF_END:%.*]] // CHECK29: omp_if.else: -// CHECK29-NEXT: br label [[OMP_INNER_FOR_COND5:%.*]] -// CHECK29: omp.inner.for.cond5: +// CHECK29-NEXT: br label [[OMP_INNER_FOR_COND4:%.*]] +// CHECK29: omp.inner.for.cond4: // CHECK29-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK29-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 -// CHECK29-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]] -// CHECK29-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY7:%.*]], label [[OMP_INNER_FOR_END18:%.*]] -// CHECK29: omp.inner.for.body7: +// CHECK29-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]] +// CHECK29-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY6:%.*]], label [[OMP_INNER_FOR_END17:%.*]] +// CHECK29: omp.inner.for.body6: // CHECK29-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 -// CHECK29-NEXT: [[MUL8:%.*]] = mul nsw i32 [[TMP11]], 1 -// CHECK29-NEXT: [[ADD9:%.*]] = add nsw i32 0, [[MUL8]] -// CHECK29-NEXT: store i32 [[ADD9]], ptr [[I]], align 4 -// CHECK29-NEXT: [[B10:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 1 -// CHECK29-NEXT: [[TMP12:%.*]] = load float, ptr [[B10]], align 4 -// CHECK29-NEXT: [[CONV11:%.*]] = fptosi float [[TMP12]] to i32 -// CHECK29-NEXT: [[A12:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 +// CHECK29-NEXT: [[MUL7:%.*]] = mul nsw i32 [[TMP11]], 1 +// CHECK29-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] +// CHECK29-NEXT: store i32 [[ADD8]], ptr [[I]], align 4 +// CHECK29-NEXT: [[B9:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 1 +// CHECK29-NEXT: [[TMP12:%.*]] = load float, ptr [[B9]], align 4 +// CHECK29-NEXT: [[CONV10:%.*]] = fptosi float [[TMP12]] to i32 +// CHECK29-NEXT: [[A11:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 // CHECK29-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 -// CHECK29-NEXT: [[IDXPROM13:%.*]] = sext i32 [[TMP13]] to i64 -// CHECK29-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [123 x i32], ptr [[A12]], i64 0, i64 [[IDXPROM13]] -// CHECK29-NEXT: store i32 [[CONV11]], ptr [[ARRAYIDX14]], align 4 -// CHECK29-NEXT: br label [[OMP_BODY_CONTINUE15:%.*]] -// CHECK29: omp.body.continue15: -// CHECK29-NEXT: br label [[OMP_INNER_FOR_INC16:%.*]] -// CHECK29: omp.inner.for.inc16: +// CHECK29-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK29-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [123 x i32], ptr [[A11]], i64 0, i64 [[IDXPROM12]] +// CHECK29-NEXT: store i32 [[CONV10]], ptr [[ARRAYIDX13]], align 4 +// CHECK29-NEXT: br label [[OMP_BODY_CONTINUE14:%.*]] +// CHECK29: omp.body.continue14: +// CHECK29-NEXT: br label [[OMP_INNER_FOR_INC15:%.*]] +// CHECK29: omp.inner.for.inc15: // CHECK29-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 -// CHECK29-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP14]], 1 -// CHECK29-NEXT: store i32 [[ADD17]], ptr [[DOTOMP_IV]], align 4 -// CHECK29-NEXT: br label [[OMP_INNER_FOR_COND5]], !llvm.loop [[LOOP7:![0-9]+]] -// CHECK29: omp.inner.for.end18: +// CHECK29-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP14]], 1 +// CHECK29-NEXT: store i32 [[ADD16]], ptr [[DOTOMP_IV]], align 4 +// CHECK29-NEXT: br label [[OMP_INNER_FOR_COND4]], !llvm.loop [[LOOP7:![0-9]+]] +// CHECK29: omp.inner.for.end17: // CHECK29-NEXT: br label [[OMP_IF_END]] // CHECK29: omp_if.end: // CHECK29-NEXT: store i32 123, ptr [[I]], align 4 -// CHECK29-NEXT: [[A19:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 -// CHECK29-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [123 x i32], ptr [[A19]], i64 0, i64 0 -// CHECK29-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 +// CHECK29-NEXT: [[A18:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 +// CHECK29-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds [123 x i32], ptr [[A18]], i64 0, i64 0 +// CHECK29-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4 // CHECK29-NEXT: ret i32 [[TMP15]] // // @@ -2988,15 +2988,15 @@ int main (int argc, char **argv) { // CHECK31-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[THIS1]], i32 0, i32 1 // CHECK31-NEXT: [[TMP0:%.*]] = load float, ptr [[B]], align 4 // CHECK31-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP0]], 0.000000e+00 -// CHECK31-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK31-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK31-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK31-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK31-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK31-NEXT: store i32 122, ptr [[DOTOMP_UB]], align 4 // CHECK31-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 // CHECK31-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_IV]], align 4 // CHECK31-NEXT: [[TMP2:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK31-NEXT: [[TOBOOL2:%.*]] = trunc i8 [[TMP2]] to i1 -// CHECK31-NEXT: br i1 [[TOBOOL2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK31-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP2]] to i1 +// CHECK31-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK31: omp_if.then: // CHECK31-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK31: omp.inner.for.cond: @@ -3009,8 +3009,8 @@ int main (int argc, char **argv) { // CHECK31-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP5]], 1 // CHECK31-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK31-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK31-NEXT: [[B3:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 1 -// CHECK31-NEXT: [[TMP6:%.*]] = load float, ptr [[B3]], align 4, !nontemporal [[META4:![0-9]+]], !llvm.access.group [[ACC_GRP3]] +// CHECK31-NEXT: [[B2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 1 +// CHECK31-NEXT: [[TMP6:%.*]] = load float, ptr [[B2]], align 4, !nontemporal [[META4:![0-9]+]], !llvm.access.group [[ACC_GRP3]] // CHECK31-NEXT: [[CONV:%.*]] = fptosi float [[TMP6]] to i32 // CHECK31-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 // CHECK31-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]] @@ -3021,45 +3021,45 @@ int main (int argc, char **argv) { // CHECK31-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK31: omp.inner.for.inc: // CHECK31-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK31-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP8]], 1 -// CHECK31-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK31-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP8]], 1 +// CHECK31-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] // CHECK31-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]] // CHECK31: omp.inner.for.end: // CHECK31-NEXT: br label [[OMP_IF_END:%.*]] // CHECK31: omp_if.else: -// CHECK31-NEXT: br label [[OMP_INNER_FOR_COND5:%.*]] -// CHECK31: omp.inner.for.cond5: +// CHECK31-NEXT: br label [[OMP_INNER_FOR_COND4:%.*]] +// CHECK31: omp.inner.for.cond4: // CHECK31-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK31-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 -// CHECK31-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]] -// CHECK31-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY7:%.*]], label [[OMP_INNER_FOR_END17:%.*]] -// CHECK31: omp.inner.for.body7: +// CHECK31-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]] +// CHECK31-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY6:%.*]], label [[OMP_INNER_FOR_END16:%.*]] +// CHECK31: omp.inner.for.body6: // CHECK31-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 -// CHECK31-NEXT: [[MUL8:%.*]] = mul nsw i32 [[TMP11]], 1 -// CHECK31-NEXT: [[ADD9:%.*]] = add nsw i32 0, [[MUL8]] -// CHECK31-NEXT: store i32 [[ADD9]], ptr [[I]], align 4 -// CHECK31-NEXT: [[B10:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 1 -// CHECK31-NEXT: [[TMP12:%.*]] = load float, ptr [[B10]], align 4 -// CHECK31-NEXT: [[CONV11:%.*]] = fptosi float [[TMP12]] to i32 -// CHECK31-NEXT: [[A12:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 +// CHECK31-NEXT: [[MUL7:%.*]] = mul nsw i32 [[TMP11]], 1 +// CHECK31-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] +// CHECK31-NEXT: store i32 [[ADD8]], ptr [[I]], align 4 +// CHECK31-NEXT: [[B9:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 1 +// CHECK31-NEXT: [[TMP12:%.*]] = load float, ptr [[B9]], align 4 +// CHECK31-NEXT: [[CONV10:%.*]] = fptosi float [[TMP12]] to i32 +// CHECK31-NEXT: [[A11:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 // CHECK31-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 -// CHECK31-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [123 x i32], ptr [[A12]], i32 0, i32 [[TMP13]] -// CHECK31-NEXT: store i32 [[CONV11]], ptr [[ARRAYIDX13]], align 4 -// CHECK31-NEXT: br label [[OMP_BODY_CONTINUE14:%.*]] -// CHECK31: omp.body.continue14: -// CHECK31-NEXT: br label [[OMP_INNER_FOR_INC15:%.*]] -// CHECK31: omp.inner.for.inc15: +// CHECK31-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [123 x i32], ptr [[A11]], i32 0, i32 [[TMP13]] +// CHECK31-NEXT: store i32 [[CONV10]], ptr [[ARRAYIDX12]], align 4 +// CHECK31-NEXT: br label [[OMP_BODY_CONTINUE13:%.*]] +// CHECK31: omp.body.continue13: +// CHECK31-NEXT: br label [[OMP_INNER_FOR_INC14:%.*]] +// CHECK31: omp.inner.for.inc14: // CHECK31-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 -// CHECK31-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP14]], 1 -// CHECK31-NEXT: store i32 [[ADD16]], ptr [[DOTOMP_IV]], align 4 -// CHECK31-NEXT: br label [[OMP_INNER_FOR_COND5]], !llvm.loop [[LOOP8:![0-9]+]] -// CHECK31: omp.inner.for.end17: +// CHECK31-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP14]], 1 +// CHECK31-NEXT: store i32 [[ADD15]], ptr [[DOTOMP_IV]], align 4 +// CHECK31-NEXT: br label [[OMP_INNER_FOR_COND4]], !llvm.loop [[LOOP8:![0-9]+]] +// CHECK31: omp.inner.for.end16: // CHECK31-NEXT: br label [[OMP_IF_END]] // CHECK31: omp_if.end: // CHECK31-NEXT: store i32 123, ptr [[I]], align 4 -// CHECK31-NEXT: [[A18:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 -// CHECK31-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds [123 x i32], ptr [[A18]], i32 0, i32 0 -// CHECK31-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4 +// CHECK31-NEXT: [[A17:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 +// CHECK31-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [123 x i32], ptr [[A17]], i32 0, i32 0 +// CHECK31-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX18]], align 4 // CHECK31-NEXT: ret i32 [[TMP15]] // // @@ -4010,12 +4010,12 @@ int main (int argc, char **argv) { // CHECK37-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK37-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK37-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0 -// CHECK37-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK37-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK37-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK37-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK37-NEXT: [[TMP3:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK37-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP3]] to i1 -// CHECK37-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK37-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK37-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// CHECK37-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK37-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK37-NEXT: [[TMP4:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK37-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB2]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l192.omp_outlined, ptr [[N_ADDR]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP4]]) // CHECK37-NEXT: ret void @@ -4085,8 +4085,8 @@ int main (int argc, char **argv) { // CHECK37-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 // CHECK37-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4 // CHECK37-NEXT: [[TMP14:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK37-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP14]] to i1 -// CHECK37-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK37-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP14]] to i1 +// CHECK37-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK37: omp_if.then: // CHECK37-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK37: omp.inner.for.cond: @@ -4474,12 +4474,12 @@ int main (int argc, char **argv) { // CHECK39-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 4 // CHECK39-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK39-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0 -// CHECK39-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK39-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK39-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK39-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK39-NEXT: [[TMP3:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK39-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP3]] to i1 -// CHECK39-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8 -// CHECK39-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 +// CHECK39-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// CHECK39-NEXT: [[STOREDV1:%.*]] = zext i1 [[LOADEDV]] to i8 +// CHECK39-NEXT: store i8 [[STOREDV1]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 // CHECK39-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4 // CHECK39-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB2]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l192.omp_outlined, ptr [[N_ADDR]], i32 [[TMP0]], ptr [[TMP1]], i32 [[TMP4]]) // CHECK39-NEXT: ret void @@ -4549,8 +4549,8 @@ int main (int argc, char **argv) { // CHECK39-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 // CHECK39-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4 // CHECK39-NEXT: [[TMP14:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1 -// CHECK39-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP14]] to i1 -// CHECK39-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK39-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP14]] to i1 +// CHECK39-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK39: omp_if.then: // CHECK39-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK39: omp.inner.for.cond: @@ -5091,8 +5091,8 @@ int main (int argc, char **argv) { // CHECK45-NEXT: store i64 [[TMP1]], ptr [[__VLA_EXPR0]], align 8 // CHECK45-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK45-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP3]], 0 -// CHECK45-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK45-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK45-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK45-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK45-NEXT: [[TMP4:%.*]] = load i32, ptr [[N]], align 4 // CHECK45-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4 // CHECK45-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 @@ -5111,15 +5111,15 @@ int main (int argc, char **argv) { // CHECK45-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 // CHECK45-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV]], align 4 // CHECK45-NEXT: [[TMP9:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK45-NEXT: [[TOBOOL5:%.*]] = trunc i8 [[TMP9]] to i1 -// CHECK45-NEXT: br i1 [[TOBOOL5]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK45-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP9]] to i1 +// CHECK45-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK45: omp_if.then: // CHECK45-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK45: omp.inner.for.cond: // CHECK45-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2:![0-9]+]] // CHECK45-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP2]] -// CHECK45-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP10]], [[TMP11]] -// CHECK45-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK45-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP10]], [[TMP11]] +// CHECK45-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK45: omp.inner.for.body: // CHECK45-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] // CHECK45-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 @@ -5134,44 +5134,44 @@ int main (int argc, char **argv) { // CHECK45-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK45: omp.inner.for.inc: // CHECK45-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] -// CHECK45-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP14]], 1 -// CHECK45-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] +// CHECK45-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP14]], 1 +// CHECK45-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP2]] // CHECK45-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] // CHECK45: omp.inner.for.end: // CHECK45-NEXT: br label [[OMP_IF_END:%.*]] // CHECK45: omp_if.else: -// CHECK45-NEXT: br label [[OMP_INNER_FOR_COND8:%.*]] -// CHECK45: omp.inner.for.cond8: +// CHECK45-NEXT: br label [[OMP_INNER_FOR_COND7:%.*]] +// CHECK45: omp.inner.for.cond7: // CHECK45-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK45-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 -// CHECK45-NEXT: [[CMP9:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]] -// CHECK45-NEXT: br i1 [[CMP9]], label [[OMP_INNER_FOR_BODY10:%.*]], label [[OMP_INNER_FOR_END18:%.*]] -// CHECK45: omp.inner.for.body10: +// CHECK45-NEXT: [[CMP8:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]] +// CHECK45-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY9:%.*]], label [[OMP_INNER_FOR_END17:%.*]] +// CHECK45: omp.inner.for.body9: // CHECK45-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 -// CHECK45-NEXT: [[MUL11:%.*]] = mul nsw i32 [[TMP17]], 1 -// CHECK45-NEXT: [[ADD12:%.*]] = add nsw i32 0, [[MUL11]] -// CHECK45-NEXT: store i32 [[ADD12]], ptr [[I4]], align 4 +// CHECK45-NEXT: [[MUL10:%.*]] = mul nsw i32 [[TMP17]], 1 +// CHECK45-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK45-NEXT: store i32 [[ADD11]], ptr [[I4]], align 4 // CHECK45-NEXT: [[TMP18:%.*]] = load i32, ptr [[I4]], align 4 -// CHECK45-NEXT: [[IDXPROM13:%.*]] = sext i32 [[TMP18]] to i64 -// CHECK45-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[IDXPROM13]] -// CHECK45-NEXT: store i32 0, ptr [[ARRAYIDX14]], align 4 -// CHECK45-NEXT: br label [[OMP_BODY_CONTINUE15:%.*]] -// CHECK45: omp.body.continue15: -// CHECK45-NEXT: br label [[OMP_INNER_FOR_INC16:%.*]] -// CHECK45: omp.inner.for.inc16: +// CHECK45-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP18]] to i64 +// CHECK45-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[IDXPROM12]] +// CHECK45-NEXT: store i32 0, ptr [[ARRAYIDX13]], align 4 +// CHECK45-NEXT: br label [[OMP_BODY_CONTINUE14:%.*]] +// CHECK45: omp.body.continue14: +// CHECK45-NEXT: br label [[OMP_INNER_FOR_INC15:%.*]] +// CHECK45: omp.inner.for.inc15: // CHECK45-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 -// CHECK45-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP19]], 1 -// CHECK45-NEXT: store i32 [[ADD17]], ptr [[DOTOMP_IV]], align 4 -// CHECK45-NEXT: br label [[OMP_INNER_FOR_COND8]], !llvm.loop [[LOOP6:![0-9]+]] -// CHECK45: omp.inner.for.end18: +// CHECK45-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP19]], 1 +// CHECK45-NEXT: store i32 [[ADD16]], ptr [[DOTOMP_IV]], align 4 +// CHECK45-NEXT: br label [[OMP_INNER_FOR_COND7]], !llvm.loop [[LOOP6:![0-9]+]] +// CHECK45: omp.inner.for.end17: // CHECK45-NEXT: br label [[OMP_IF_END]] // CHECK45: omp_if.end: // CHECK45-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK45-NEXT: [[SUB19:%.*]] = sub nsw i32 [[TMP20]], 0 -// CHECK45-NEXT: [[DIV20:%.*]] = sdiv i32 [[SUB19]], 1 -// CHECK45-NEXT: [[MUL21:%.*]] = mul nsw i32 [[DIV20]], 1 -// CHECK45-NEXT: [[ADD22:%.*]] = add nsw i32 0, [[MUL21]] -// CHECK45-NEXT: store i32 [[ADD22]], ptr [[I4]], align 4 +// CHECK45-NEXT: [[SUB18:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK45-NEXT: [[DIV19:%.*]] = sdiv i32 [[SUB18]], 1 +// CHECK45-NEXT: [[MUL20:%.*]] = mul nsw i32 [[DIV19]], 1 +// CHECK45-NEXT: [[ADD21:%.*]] = add nsw i32 0, [[MUL20]] +// CHECK45-NEXT: store i32 [[ADD21]], ptr [[I4]], align 4 // CHECK45-NEXT: br label [[SIMD_IF_END]] // CHECK45: simd.if.end: // CHECK45-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -5259,8 +5259,8 @@ int main (int argc, char **argv) { // CHECK47-NEXT: store i32 [[TMP0]], ptr [[__VLA_EXPR0]], align 4 // CHECK47-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 // CHECK47-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0 -// CHECK47-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8 -// CHECK47-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1 +// CHECK47-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8 +// CHECK47-NEXT: store i8 [[STOREDV]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK47-NEXT: [[TMP3:%.*]] = load i32, ptr [[N]], align 4 // CHECK47-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_1]], align 4 // CHECK47-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 @@ -5279,15 +5279,15 @@ int main (int argc, char **argv) { // CHECK47-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 // CHECK47-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4 // CHECK47-NEXT: [[TMP8:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 -// CHECK47-NEXT: [[TOBOOL5:%.*]] = trunc i8 [[TMP8]] to i1 -// CHECK47-NEXT: br i1 [[TOBOOL5]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] +// CHECK47-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP8]] to i1 +// CHECK47-NEXT: br i1 [[LOADEDV]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]] // CHECK47: omp_if.then: // CHECK47-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK47: omp.inner.for.cond: // CHECK47-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]] // CHECK47-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK47-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]] -// CHECK47-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK47-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]] +// CHECK47-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK47: omp.inner.for.body: // CHECK47-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] // CHECK47-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP11]], 1 @@ -5301,43 +5301,43 @@ int main (int argc, char **argv) { // CHECK47-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK47: omp.inner.for.inc: // CHECK47-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK47-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK47-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK47-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK47-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] // CHECK47-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]] // CHECK47: omp.inner.for.end: // CHECK47-NEXT: br label [[OMP_IF_END:%.*]] // CHECK47: omp_if.else: -// CHECK47-NEXT: br label [[OMP_INNER_FOR_COND8:%.*]] -// CHECK47: omp.inner.for.cond8: +// CHECK47-NEXT: br label [[OMP_INNER_FOR_COND7:%.*]] +// CHECK47: omp.inner.for.cond7: // CHECK47-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK47-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 -// CHECK47-NEXT: [[CMP9:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK47-NEXT: br i1 [[CMP9]], label [[OMP_INNER_FOR_BODY10:%.*]], label [[OMP_INNER_FOR_END17:%.*]] -// CHECK47: omp.inner.for.body10: +// CHECK47-NEXT: [[CMP8:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK47-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY9:%.*]], label [[OMP_INNER_FOR_END16:%.*]] +// CHECK47: omp.inner.for.body9: // CHECK47-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 -// CHECK47-NEXT: [[MUL11:%.*]] = mul nsw i32 [[TMP16]], 1 -// CHECK47-NEXT: [[ADD12:%.*]] = add nsw i32 0, [[MUL11]] -// CHECK47-NEXT: store i32 [[ADD12]], ptr [[I4]], align 4 +// CHECK47-NEXT: [[MUL10:%.*]] = mul nsw i32 [[TMP16]], 1 +// CHECK47-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK47-NEXT: store i32 [[ADD11]], ptr [[I4]], align 4 // CHECK47-NEXT: [[TMP17:%.*]] = load i32, ptr [[I4]], align 4 -// CHECK47-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i32 [[TMP17]] -// CHECK47-NEXT: store i32 0, ptr [[ARRAYIDX13]], align 4 -// CHECK47-NEXT: br label [[OMP_BODY_CONTINUE14:%.*]] -// CHECK47: omp.body.continue14: -// CHECK47-NEXT: br label [[OMP_INNER_FOR_INC15:%.*]] -// CHECK47: omp.inner.for.inc15: +// CHECK47-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i32 [[TMP17]] +// CHECK47-NEXT: store i32 0, ptr [[ARRAYIDX12]], align 4 +// CHECK47-NEXT: br label [[OMP_BODY_CONTINUE13:%.*]] +// CHECK47: omp.body.continue13: +// CHECK47-NEXT: br label [[OMP_INNER_FOR_INC14:%.*]] +// CHECK47: omp.inner.for.inc14: // CHECK47-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 -// CHECK47-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP18]], 1 -// CHECK47-NEXT: store i32 [[ADD16]], ptr [[DOTOMP_IV]], align 4 -// CHECK47-NEXT: br label [[OMP_INNER_FOR_COND8]], !llvm.loop [[LOOP7:![0-9]+]] -// CHECK47: omp.inner.for.end17: +// CHECK47-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK47-NEXT: store i32 [[ADD15]], ptr [[DOTOMP_IV]], align 4 +// CHECK47-NEXT: br label [[OMP_INNER_FOR_COND7]], !llvm.loop [[LOOP7:![0-9]+]] +// CHECK47: omp.inner.for.end16: // CHECK47-NEXT: br label [[OMP_IF_END]] // CHECK47: omp_if.end: // CHECK47-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK47-NEXT: [[SUB18:%.*]] = sub nsw i32 [[TMP19]], 0 -// CHECK47-NEXT: [[DIV19:%.*]] = sdiv i32 [[SUB18]], 1 -// CHECK47-NEXT: [[MUL20:%.*]] = mul nsw i32 [[DIV19]], 1 -// CHECK47-NEXT: [[ADD21:%.*]] = add nsw i32 0, [[MUL20]] -// CHECK47-NEXT: store i32 [[ADD21]], ptr [[I4]], align 4 +// CHECK47-NEXT: [[SUB17:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK47-NEXT: [[DIV18:%.*]] = sdiv i32 [[SUB17]], 1 +// CHECK47-NEXT: [[MUL19:%.*]] = mul nsw i32 [[DIV18]], 1 +// CHECK47-NEXT: [[ADD20:%.*]] = add nsw i32 0, [[MUL19]] +// CHECK47-NEXT: store i32 [[ADD20]], ptr [[I4]], align 4 // CHECK47-NEXT: br label [[SIMD_IF_END]] // CHECK47: simd.if.end: // CHECK47-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 diff --git a/clang/test/Parser/cxx2a-concepts-requires-expr.cpp b/clang/test/Parser/cxx2a-concepts-requires-expr.cpp index 0c7f453b5c48d8..5755844a323d2c 100644 --- a/clang/test/Parser/cxx2a-concepts-requires-expr.cpp +++ b/clang/test/Parser/cxx2a-concepts-requires-expr.cpp @@ -78,7 +78,7 @@ bool r22 = requires { typename s::~s; }; template bool r23 = requires { typename identity::temp; }; -// expected-warning@-1 {{use 'template' keyword to treat 'temp' as a dependent template name}} +// expected-error@-1 {{use 'template' keyword to treat 'temp' as a dependent template name}} template bool r24 = requires { diff --git a/clang/test/Preprocessor/embed_codegen.cpp b/clang/test/Preprocessor/embed_codegen.cpp index 64110afc162d72..201bf300bc6694 100644 --- a/clang/test/Preprocessor/embed_codegen.cpp +++ b/clang/test/Preprocessor/embed_codegen.cpp @@ -43,8 +43,9 @@ a }; // CHECK: store i32 107, ptr %b, align 4 -int b = +int b = ( #embed + ) ; diff --git a/clang/test/Preprocessor/embed_constexpr.cpp b/clang/test/Preprocessor/embed_constexpr.cpp index 1cadff76b4890a..a7857641a2e8df 100644 --- a/clang/test/Preprocessor/embed_constexpr.cpp +++ b/clang/test/Preprocessor/embed_constexpr.cpp @@ -1,5 +1,6 @@ // RUN: %clang_cc1 %s -fsyntax-only --embed-dir=%S/Inputs -verify -Wno-c23-extensions // RUN: %clang_cc1 %s -fsyntax-only --embed-dir=%S/Inputs -verify -fexperimental-new-constant-interpreter -Wno-c23-extensions +// expected-no-diagnostics constexpr int value(int a, int b) { return a + b; @@ -46,7 +47,7 @@ int array[ static_assert(sizeof(array) / sizeof(int) == 'j'); constexpr int comma_expr = ( -#embed // expected-warning {{left operand of comma operator has no effect}} +#embed ); static_assert(comma_expr == 'k'); diff --git a/clang/test/Preprocessor/embed_weird.cpp b/clang/test/Preprocessor/embed_weird.cpp index 31b622c848d6a9..cc73a88e5a657b 100644 --- a/clang/test/Preprocessor/embed_weird.cpp +++ b/clang/test/Preprocessor/embed_weird.cpp @@ -27,7 +27,7 @@ _Static_assert( _Static_assert(sizeof( #embed ) == -sizeof(unsigned char) +sizeof(int) , "" ); _Static_assert(sizeof @@ -35,9 +35,9 @@ _Static_assert(sizeof , "" ); _Static_assert(sizeof( -#embed // expected-warning {{left operand of comma operator has no effect}} +#embed ) == -sizeof(unsigned char) +sizeof(int) , "" ); @@ -73,10 +73,10 @@ void do_stuff() { // Ensure that we don't accidentally allow you to initialize an unsigned char * // from embedded data; the data is modeled as a string literal internally, but // is not actually a string literal. -const unsigned char *ptr = +const unsigned char *ptr = ( #embed // expected-warning {{left operand of comma operator has no effect}} -; // c-error@-2 {{incompatible integer to pointer conversion initializing 'const unsigned char *' with an expression of type 'unsigned char'}} \ - cxx-error@-2 {{cannot initialize a variable of type 'const unsigned char *' with an rvalue of type 'unsigned char'}} + ); // c-error@-2 {{incompatible integer to pointer conversion initializing 'const unsigned char *' with an expression of type 'int'}} \ + cxx-error@-2 {{cannot initialize a variable of type 'const unsigned char *' with an rvalue of type 'int'}} // However, there are some cases where this is fine and should work. const unsigned char *null_ptr_1 = @@ -101,11 +101,10 @@ constexpr unsigned char ch = ; static_assert(ch == 0); -void foobar(float x, char y, char z); // cxx-note {{candidate function not viable: requires 3 arguments, but 1 was provided}} - // c-note@-1 {{declared here}} -void g1() { foobar((float) // cxx-error {{no matching function for call to 'foobar'}} -#embed "numbers.txt" limit(3) // expected-warning {{left operand of comma operator has no effect}} -); // c-error {{too few arguments to function call, expected 3, have 1}} +void foobar(float x, char y, char z); +void g1() { foobar((float) +#embed "numbers.txt" limit(3) +); } #if __cplusplus diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c index f0a2ef851287f0..6f470d85ca563c 100644 --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -99,7 +99,6 @@ // RUN: %clang -march=winchip2 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_WINCHIP2_M32 -// CHECK_WINCHIP2_M32: #define __3dNOW__ 1 // CHECK_WINCHIP2_M32: #define __MMX__ 1 // CHECK_WINCHIP2_M32: #define __i386 1 // CHECK_WINCHIP2_M32: #define __i386__ 1 @@ -115,7 +114,6 @@ // RUN: %clang -march=c3 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_C3_M32 -// CHECK_C3_M32: #define __3dNOW__ 1 // CHECK_C3_M32: #define __MMX__ 1 // CHECK_C3_M32: #define __i386 1 // CHECK_C3_M32: #define __i386__ 1 @@ -2707,8 +2705,6 @@ // RUN: %clang -march=geode -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_GEODE_M32 -// CHECK_GEODE_M32: #define __3dNOW_A__ 1 -// CHECK_GEODE_M32: #define __3dNOW__ 1 // CHECK_GEODE_M32: #define __MMX__ 1 // CHECK_GEODE_M32: #define __geode 1 // CHECK_GEODE_M32: #define __geode__ 1 @@ -2739,7 +2735,6 @@ // RUN: %clang -march=k6-2 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_K6_2_M32 -// CHECK_K6_2_M32: #define __3dNOW__ 1 // CHECK_K6_2_M32: #define __MMX__ 1 // CHECK_K6_2_M32: #define __i386 1 // CHECK_K6_2_M32: #define __i386__ 1 @@ -2757,7 +2752,6 @@ // RUN: %clang -march=k6-3 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_K6_3_M32 -// CHECK_K6_3_M32: #define __3dNOW__ 1 // CHECK_K6_3_M32: #define __MMX__ 1 // CHECK_K6_3_M32: #define __i386 1 // CHECK_K6_3_M32: #define __i386__ 1 @@ -2775,8 +2769,6 @@ // RUN: %clang -march=athlon -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_M32 -// CHECK_ATHLON_M32: #define __3dNOW_A__ 1 -// CHECK_ATHLON_M32: #define __3dNOW__ 1 // CHECK_ATHLON_M32: #define __MMX__ 1 // CHECK_ATHLON_M32: #define __athlon 1 // CHECK_ATHLON_M32: #define __athlon__ 1 @@ -2792,8 +2784,6 @@ // RUN: %clang -march=athlon-tbird -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_TBIRD_M32 -// CHECK_ATHLON_TBIRD_M32: #define __3dNOW_A__ 1 -// CHECK_ATHLON_TBIRD_M32: #define __3dNOW__ 1 // CHECK_ATHLON_TBIRD_M32: #define __MMX__ 1 // CHECK_ATHLON_TBIRD_M32: #define __athlon 1 // CHECK_ATHLON_TBIRD_M32: #define __athlon__ 1 @@ -2809,8 +2799,6 @@ // RUN: %clang -march=athlon-4 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_4_M32 -// CHECK_ATHLON_4_M32: #define __3dNOW_A__ 1 -// CHECK_ATHLON_4_M32: #define __3dNOW__ 1 // CHECK_ATHLON_4_M32: #define __MMX__ 1 // CHECK_ATHLON_4_M32: #define __SSE__ 1 // CHECK_ATHLON_4_M32: #define __athlon 1 @@ -2829,8 +2817,6 @@ // RUN: %clang -march=athlon-xp -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_XP_M32 -// CHECK_ATHLON_XP_M32: #define __3dNOW_A__ 1 -// CHECK_ATHLON_XP_M32: #define __3dNOW__ 1 // CHECK_ATHLON_XP_M32: #define __MMX__ 1 // CHECK_ATHLON_XP_M32: #define __SSE__ 1 // CHECK_ATHLON_XP_M32: #define __athlon 1 @@ -2849,8 +2835,6 @@ // RUN: %clang -march=athlon-mp -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_MP_M32 -// CHECK_ATHLON_MP_M32: #define __3dNOW_A__ 1 -// CHECK_ATHLON_MP_M32: #define __3dNOW__ 1 // CHECK_ATHLON_MP_M32: #define __MMX__ 1 // CHECK_ATHLON_MP_M32: #define __SSE__ 1 // CHECK_ATHLON_MP_M32: #define __athlon 1 @@ -2881,8 +2865,6 @@ // RUN: %clang -march=k8 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_K8_M32 -// CHECK_K8_M32: #define __3dNOW_A__ 1 -// CHECK_K8_M32: #define __3dNOW__ 1 // CHECK_K8_M32: #define __MMX__ 1 // CHECK_K8_M32: #define __SSE2__ 1 // CHECK_K8_M32: #define __SSE__ 1 @@ -2896,8 +2878,6 @@ // RUN: %clang -march=k8 -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_K8_M64 -// CHECK_K8_M64: #define __3dNOW_A__ 1 -// CHECK_K8_M64: #define __3dNOW__ 1 // CHECK_K8_M64: #define __MMX__ 1 // CHECK_K8_M64: #define __SSE2_MATH__ 1 // CHECK_K8_M64: #define __SSE2__ 1 @@ -2914,8 +2894,6 @@ // RUN: %clang -march=k8-sse3 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_K8_SSE3_M32 -// CHECK_K8_SSE3_M32: #define __3dNOW_A__ 1 -// CHECK_K8_SSE3_M32: #define __3dNOW__ 1 // CHECK_K8_SSE3_M32: #define __MMX__ 1 // CHECK_K8_SSE3_M32: #define __SSE2__ 1 // CHECK_K8_SSE3_M32: #define __SSE3__ 1 @@ -2930,8 +2908,6 @@ // RUN: %clang -march=k8-sse3 -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_K8_SSE3_M64 -// CHECK_K8_SSE3_M64: #define __3dNOW_A__ 1 -// CHECK_K8_SSE3_M64: #define __3dNOW__ 1 // CHECK_K8_SSE3_M64: #define __MMX__ 1 // CHECK_K8_SSE3_M64: #define __SSE2_MATH__ 1 // CHECK_K8_SSE3_M64: #define __SSE2__ 1 @@ -2949,8 +2925,6 @@ // RUN: %clang -march=opteron -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_OPTERON_M32 -// CHECK_OPTERON_M32: #define __3dNOW_A__ 1 -// CHECK_OPTERON_M32: #define __3dNOW__ 1 // CHECK_OPTERON_M32: #define __MMX__ 1 // CHECK_OPTERON_M32: #define __SSE2__ 1 // CHECK_OPTERON_M32: #define __SSE__ 1 @@ -2964,8 +2938,6 @@ // RUN: %clang -march=opteron -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_OPTERON_M64 -// CHECK_OPTERON_M64: #define __3dNOW_A__ 1 -// CHECK_OPTERON_M64: #define __3dNOW__ 1 // CHECK_OPTERON_M64: #define __MMX__ 1 // CHECK_OPTERON_M64: #define __SSE2_MATH__ 1 // CHECK_OPTERON_M64: #define __SSE2__ 1 @@ -2982,8 +2954,6 @@ // RUN: %clang -march=opteron-sse3 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_OPTERON_SSE3_M32 -// CHECK_OPTERON_SSE3_M32: #define __3dNOW_A__ 1 -// CHECK_OPTERON_SSE3_M32: #define __3dNOW__ 1 // CHECK_OPTERON_SSE3_M32: #define __MMX__ 1 // CHECK_OPTERON_SSE3_M32: #define __SSE2__ 1 // CHECK_OPTERON_SSE3_M32: #define __SSE3__ 1 @@ -2998,8 +2968,6 @@ // RUN: %clang -march=opteron-sse3 -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_OPTERON_SSE3_M64 -// CHECK_OPTERON_SSE3_M64: #define __3dNOW_A__ 1 -// CHECK_OPTERON_SSE3_M64: #define __3dNOW__ 1 // CHECK_OPTERON_SSE3_M64: #define __MMX__ 1 // CHECK_OPTERON_SSE3_M64: #define __SSE2_MATH__ 1 // CHECK_OPTERON_SSE3_M64: #define __SSE2__ 1 @@ -3017,8 +2985,6 @@ // RUN: %clang -march=athlon64 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON64_M32 -// CHECK_ATHLON64_M32: #define __3dNOW_A__ 1 -// CHECK_ATHLON64_M32: #define __3dNOW__ 1 // CHECK_ATHLON64_M32: #define __MMX__ 1 // CHECK_ATHLON64_M32: #define __SSE2__ 1 // CHECK_ATHLON64_M32: #define __SSE__ 1 @@ -3032,8 +2998,6 @@ // RUN: %clang -march=athlon64 -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON64_M64 -// CHECK_ATHLON64_M64: #define __3dNOW_A__ 1 -// CHECK_ATHLON64_M64: #define __3dNOW__ 1 // CHECK_ATHLON64_M64: #define __MMX__ 1 // CHECK_ATHLON64_M64: #define __SSE2_MATH__ 1 // CHECK_ATHLON64_M64: #define __SSE2__ 1 @@ -3050,8 +3014,6 @@ // RUN: %clang -march=athlon64-sse3 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON64_SSE3_M32 -// CHECK_ATHLON64_SSE3_M32: #define __3dNOW_A__ 1 -// CHECK_ATHLON64_SSE3_M32: #define __3dNOW__ 1 // CHECK_ATHLON64_SSE3_M32: #define __MMX__ 1 // CHECK_ATHLON64_SSE3_M32: #define __SSE2__ 1 // CHECK_ATHLON64_SSE3_M32: #define __SSE3__ 1 @@ -3066,8 +3028,6 @@ // RUN: %clang -march=athlon64-sse3 -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON64_SSE3_M64 -// CHECK_ATHLON64_SSE3_M64: #define __3dNOW_A__ 1 -// CHECK_ATHLON64_SSE3_M64: #define __3dNOW__ 1 // CHECK_ATHLON64_SSE3_M64: #define __MMX__ 1 // CHECK_ATHLON64_SSE3_M64: #define __SSE2_MATH__ 1 // CHECK_ATHLON64_SSE3_M64: #define __SSE2__ 1 @@ -3085,8 +3045,6 @@ // RUN: %clang -march=athlon-fx -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_FX_M32 -// CHECK_ATHLON_FX_M32: #define __3dNOW_A__ 1 -// CHECK_ATHLON_FX_M32: #define __3dNOW__ 1 // CHECK_ATHLON_FX_M32: #define __MMX__ 1 // CHECK_ATHLON_FX_M32: #define __SSE2__ 1 // CHECK_ATHLON_FX_M32: #define __SSE__ 1 @@ -3100,8 +3058,6 @@ // RUN: %clang -march=athlon-fx -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_FX_M64 -// CHECK_ATHLON_FX_M64: #define __3dNOW_A__ 1 -// CHECK_ATHLON_FX_M64: #define __3dNOW__ 1 // CHECK_ATHLON_FX_M64: #define __MMX__ 1 // CHECK_ATHLON_FX_M64: #define __SSE2_MATH__ 1 // CHECK_ATHLON_FX_M64: #define __SSE2__ 1 @@ -3118,8 +3074,6 @@ // RUN: %clang -march=amdfam10 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_AMDFAM10_M32 -// CHECK_AMDFAM10_M32: #define __3dNOW_A__ 1 -// CHECK_AMDFAM10_M32: #define __3dNOW__ 1 // CHECK_AMDFAM10_M32: #define __LAHF_SAHF__ 1 // CHECK_AMDFAM10_M32: #define __LZCNT__ 1 // CHECK_AMDFAM10_M32: #define __MMX__ 1 @@ -3141,8 +3095,6 @@ // RUN: %clang -march=amdfam10 -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_AMDFAM10_M64 -// CHECK_AMDFAM10_M64: #define __3dNOW_A__ 1 -// CHECK_AMDFAM10_M64: #define __3dNOW__ 1 // CHECK_AMDFAM10_M64: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 1 // CHECK_AMDFAM10_M64: #define __LAHF_SAHF__ 1 // CHECK_AMDFAM10_M64: #define __LZCNT__ 1 @@ -3167,8 +3119,6 @@ // RUN: %clang -march=btver1 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_BTVER1_M32 -// CHECK_BTVER1_M32-NOT: #define __3dNOW_A__ 1 -// CHECK_BTVER1_M32-NOT: #define __3dNOW__ 1 // CHECK_BTVER1_M32: #define __LAHF_SAHF__ 1 // CHECK_BTVER1_M32: #define __LZCNT__ 1 // CHECK_BTVER1_M32: #define __MMX__ 1 @@ -3190,8 +3140,6 @@ // RUN: %clang -march=btver1 -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_BTVER1_M64 -// CHECK_BTVER1_M64-NOT: #define __3dNOW_A__ 1 -// CHECK_BTVER1_M64-NOT: #define __3dNOW__ 1 // CHECK_BTVER1_M64: #define __LAHF_SAHF__ 1 // CHECK_BTVER1_M64: #define __LZCNT__ 1 // CHECK_BTVER1_M64: #define __MMX__ 1 @@ -3215,8 +3163,6 @@ // RUN: %clang -march=btver2 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_BTVER2_M32 -// CHECK_BTVER2_M32-NOT: #define __3dNOW_A__ 1 -// CHECK_BTVER2_M32-NOT: #define __3dNOW__ 1 // CHECK_BTVER2_M32: #define __AES__ 1 // CHECK_BTVER2_M32: #define __AVX__ 1 // CHECK_BTVER2_M32: #define __BMI__ 1 @@ -3245,8 +3191,6 @@ // RUN: %clang -march=btver2 -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_BTVER2_M64 -// CHECK_BTVER2_M64-NOT: #define __3dNOW_A__ 1 -// CHECK_BTVER2_M64-NOT: #define __3dNOW__ 1 // CHECK_BTVER2_M64: #define __AES__ 1 // CHECK_BTVER2_M64: #define __AVX__ 1 // CHECK_BTVER2_M64: #define __BMI__ 1 @@ -3277,8 +3221,6 @@ // RUN: %clang -march=bdver1 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER1_M32 -// CHECK_BDVER1_M32-NOT: #define __3dNOW_A__ 1 -// CHECK_BDVER1_M32-NOT: #define __3dNOW__ 1 // CHECK_BDVER1_M32: #define __AES__ 1 // CHECK_BDVER1_M32: #define __AVX__ 1 // CHECK_BDVER1_M32: #define __FMA4__ 1 @@ -3308,8 +3250,6 @@ // RUN: %clang -march=bdver1 -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER1_M64 -// CHECK_BDVER1_M64-NOT: #define __3dNOW_A__ 1 -// CHECK_BDVER1_M64-NOT: #define __3dNOW__ 1 // CHECK_BDVER1_M64: #define __AES__ 1 // CHECK_BDVER1_M64: #define __AVX__ 1 // CHECK_BDVER1_M64: #define __FMA4__ 1 @@ -3341,8 +3281,6 @@ // RUN: %clang -march=bdver2 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER2_M32 -// CHECK_BDVER2_M32-NOT: #define __3dNOW_A__ 1 -// CHECK_BDVER2_M32-NOT: #define __3dNOW__ 1 // CHECK_BDVER2_M32: #define __AES__ 1 // CHECK_BDVER2_M32: #define __AVX__ 1 // CHECK_BDVER2_M32: #define __BMI__ 1 @@ -3376,8 +3314,6 @@ // RUN: %clang -march=bdver2 -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER2_M64 -// CHECK_BDVER2_M64-NOT: #define __3dNOW_A__ 1 -// CHECK_BDVER2_M64-NOT: #define __3dNOW__ 1 // CHECK_BDVER2_M64: #define __AES__ 1 // CHECK_BDVER2_M64: #define __AVX__ 1 // CHECK_BDVER2_M64: #define __BMI__ 1 @@ -3413,8 +3349,6 @@ // RUN: %clang -march=bdver3 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER3_M32 -// CHECK_BDVER3_M32-NOT: #define __3dNOW_A__ 1 -// CHECK_BDVER3_M32-NOT: #define __3dNOW__ 1 // CHECK_BDVER3_M32: #define __AES__ 1 // CHECK_BDVER3_M32: #define __AVX__ 1 // CHECK_BDVER3_M32: #define __BMI__ 1 @@ -3450,8 +3384,6 @@ // RUN: %clang -march=bdver3 -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER3_M64 -// CHECK_BDVER3_M64-NOT: #define __3dNOW_A__ 1 -// CHECK_BDVER3_M64-NOT: #define __3dNOW__ 1 // CHECK_BDVER3_M64: #define __AES__ 1 // CHECK_BDVER3_M64: #define __AVX__ 1 // CHECK_BDVER3_M64: #define __BMI__ 1 @@ -3489,8 +3421,6 @@ // RUN: %clang -march=bdver4 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER4_M32 -// CHECK_BDVER4_M32-NOT: #define __3dNOW_A__ 1 -// CHECK_BDVER4_M32-NOT: #define __3dNOW__ 1 // CHECK_BDVER4_M32: #define __AES__ 1 // CHECK_BDVER4_M32: #define __AVX2__ 1 // CHECK_BDVER4_M32: #define __AVX__ 1 @@ -3529,8 +3459,6 @@ // RUN: %clang -march=bdver4 -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER4_M64 -// CHECK_BDVER4_M64-NOT: #define __3dNOW_A__ 1 -// CHECK_BDVER4_M64-NOT: #define __3dNOW__ 1 // CHECK_BDVER4_M64: #define __AES__ 1 // CHECK_BDVER4_M64: #define __AVX2__ 1 // CHECK_BDVER4_M64: #define __AVX__ 1 @@ -3571,8 +3499,6 @@ // RUN: %clang -march=znver1 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER1_M32 -// CHECK_ZNVER1_M32-NOT: #define __3dNOW_A__ 1 -// CHECK_ZNVER1_M32-NOT: #define __3dNOW__ 1 // CHECK_ZNVER1_M32: #define __ADX__ 1 // CHECK_ZNVER1_M32: #define __AES__ 1 // CHECK_ZNVER1_M32: #define __AVX2__ 1 @@ -3618,8 +3544,6 @@ // RUN: %clang -march=znver1 -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER1_M64 -// CHECK_ZNVER1_M64-NOT: #define __3dNOW_A__ 1 -// CHECK_ZNVER1_M64-NOT: #define __3dNOW__ 1 // CHECK_ZNVER1_M64: #define __ADX__ 1 // CHECK_ZNVER1_M64: #define __AES__ 1 // CHECK_ZNVER1_M64: #define __AVX2__ 1 @@ -3668,8 +3592,6 @@ // RUN: %clang -march=znver2 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER2_M32 -// CHECK_ZNVER2_M32-NOT: #define __3dNOW_A__ 1 -// CHECK_ZNVER2_M32-NOT: #define __3dNOW__ 1 // CHECK_ZNVER2_M32: #define __ADX__ 1 // CHECK_ZNVER2_M32: #define __AES__ 1 // CHECK_ZNVER2_M32: #define __AVX2__ 1 @@ -3719,8 +3641,6 @@ // RUN: %clang -march=znver2 -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER2_M64 -// CHECK_ZNVER2_M64-NOT: #define __3dNOW_A__ 1 -// CHECK_ZNVER2_M64-NOT: #define __3dNOW__ 1 // CHECK_ZNVER2_M64: #define __ADX__ 1 // CHECK_ZNVER2_M64: #define __AES__ 1 // CHECK_ZNVER2_M64: #define __AVX2__ 1 @@ -3772,8 +3692,6 @@ // RUN: %clang -march=znver3 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER3_M32 -// CHECK_ZNVER3_M32-NOT: #define __3dNOW_A__ 1 -// CHECK_ZNVER3_M32-NOT: #define __3dNOW__ 1 // CHECK_ZNVER3_M32: #define __ADX__ 1 // CHECK_ZNVER3_M32: #define __AES__ 1 // CHECK_ZNVER3_M32: #define __AVX2__ 1 @@ -3823,8 +3741,6 @@ // RUN: %clang -march=znver3 -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER3_M64 -// CHECK_ZNVER3_M64-NOT: #define __3dNOW_A__ 1 -// CHECK_ZNVER3_M64-NOT: #define __3dNOW__ 1 // CHECK_ZNVER3_M64: #define __ADX__ 1 // CHECK_ZNVER3_M64: #define __AES__ 1 // CHECK_ZNVER3_M64: #define __AVX2__ 1 @@ -3878,8 +3794,6 @@ // RUN: %clang -march=znver4 -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER4_M32 -// CHECK_ZNVER4_M32-NOT: #define __3dNOW_A__ 1 -// CHECK_ZNVER4_M32-NOT: #define __3dNOW__ 1 // CHECK_ZNVER4_M32: #define __ADX__ 1 // CHECK_ZNVER4_M32: #define __AES__ 1 // CHECK_ZNVER4_M32: #define __AVX2__ 1 @@ -3944,8 +3858,6 @@ // RUN: %clang -march=znver4 -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER4_M64 -// CHECK_ZNVER4_M64-NOT: #define __3dNOW_A__ 1 -// CHECK_ZNVER4_M64-NOT: #define __3dNOW__ 1 // CHECK_ZNVER4_M64: #define __ADX__ 1 // CHECK_ZNVER4_M64: #define __AES__ 1 // CHECK_ZNVER4_M64: #define __AVX2__ 1 diff --git a/clang/test/Preprocessor/predefined-macros-no-warnings.c b/clang/test/Preprocessor/predefined-macros-no-warnings.c index d44b99a2b192a1..722e3e77214b64 100644 --- a/clang/test/Preprocessor/predefined-macros-no-warnings.c +++ b/clang/test/Preprocessor/predefined-macros-no-warnings.c @@ -75,6 +75,8 @@ // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple m68k // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple m68k-linux // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple m68k-netbsd +// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple le32-nacl +// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple le64 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple ppc // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple ppc-freebsd // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple ppc-netbsd diff --git a/clang/test/Preprocessor/ptrauth_feature.c b/clang/test/Preprocessor/ptrauth_feature.c index 88b6982c016572..1330ad10b4b474 100644 --- a/clang/test/Preprocessor/ptrauth_feature.c +++ b/clang/test/Preprocessor/ptrauth_feature.c @@ -1,69 +1,26 @@ -// RUN: %clang_cc1 -E %s -triple=aarch64 \ -// RUN: -fptrauth-intrinsics \ -// RUN: -fptrauth-calls \ -// RUN: -fptrauth-returns \ -// RUN: -fptrauth-vtable-pointer-address-discrimination \ -// RUN: -fptrauth-vtable-pointer-type-discrimination \ -// RUN: -fptrauth-init-fini | \ -// RUN: FileCheck %s --check-prefixes=INTRIN,CALLS,RETS,VPTR_ADDR_DISCR,VPTR_TYPE_DISCR,INITFINI,NOFUNC +//// Note: preprocessor features exactly match corresponding clang driver flags. However, some flags are only intended to be used in combination with other ones. +//// For example, -fptrauth-init-fini will not affect codegen without -fptrauth-calls, but the preprocessor feature would be set anyway. -// RUN: %clang_cc1 -E %s -triple=aarch64 \ -// RUN: -fptrauth-calls \ -// RUN: -fptrauth-returns \ -// RUN: -fptrauth-vtable-pointer-address-discrimination \ -// RUN: -fptrauth-vtable-pointer-type-discrimination \ -// RUN: -fptrauth-init-fini | \ -// RUN: FileCheck %s --check-prefixes=NOINTRIN,CALLS,RETS,VPTR_ADDR_DISCR,VPTR_TYPE_DISCR,INITFINI,NOFUNC +// RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-intrinsics | \ +// RUN: FileCheck %s --check-prefixes=INTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOFUNC,NOINITFINI -// RUN: %clang_cc1 -E %s -triple=aarch64 \ -// RUN: -fptrauth-intrinsics \ -// RUN: -fptrauth-returns \ -// RUN: -fptrauth-vtable-pointer-address-discrimination \ -// RUN: -fptrauth-vtable-pointer-type-discrimination \ -// RUN: -fptrauth-init-fini | \ -// RUN: FileCheck %s --check-prefixes=INTRIN,NOCALLS,RETS,VPTR_ADDR_DISCR,VPTR_TYPE_DISCR,INITFINI,NOFUNC +// RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-calls | \ +// RUN: FileCheck %s --check-prefixes=NOINTRIN,CALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOFUNC,NOINITFINI -// RUN: %clang_cc1 -E %s -triple=aarch64 \ -// RUN: -fptrauth-intrinsics \ -// RUN: -fptrauth-calls \ -// RUN: -fptrauth-vtable-pointer-address-discrimination \ -// RUN: -fptrauth-vtable-pointer-type-discrimination \ -// RUN: -fptrauth-init-fini | \ -// RUN: FileCheck %s --check-prefixes=INTRIN,CALLS,NORETS,VPTR_ADDR_DISCR,VPTR_TYPE_DISCR,INITFINI,NOFUNC +// RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-returns | \ +// RUN: FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,RETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOFUNC,NOINITFINI -// RUN: %clang_cc1 -E %s -triple=aarch64 \ -// RUN: -fptrauth-intrinsics \ -// RUN: -fptrauth-calls \ -// RUN: -fptrauth-returns \ -// RUN: -fptrauth-vtable-pointer-type-discrimination \ -// RUN: -fptrauth-init-fini | \ -// RUN: FileCheck %s --check-prefixes=INTRIN,CALLS,RETS,NOVPTR_ADDR_DISCR,VPTR_TYPE_DISCR,INITFINI,NOFUNC +// RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-vtable-pointer-address-discrimination | \ +// RUN: FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,VPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOFUNC,NOINITFINI -// RUN: %clang_cc1 -E %s -triple=aarch64 \ -// RUN: -fptrauth-intrinsics \ -// RUN: -fptrauth-calls \ -// RUN: -fptrauth-returns \ -// RUN: -fptrauth-vtable-pointer-address-discrimination \ -// RUN: -fptrauth-init-fini | \ -// RUN: FileCheck %s --check-prefixes=INTRIN,CALLS,RETS,VPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,INITFINI,NOFUNC +// RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-vtable-pointer-type-discrimination | \ +// RUN: FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,VPTR_TYPE_DISCR,NOFUNC,NOINITFINI -// RUN: %clang_cc1 -E %s -triple=aarch64 \ -// RUN: -fptrauth-intrinsics \ -// RUN: -fptrauth-calls \ -// RUN: -fptrauth-returns \ -// RUN: -fptrauth-vtable-pointer-address-discrimination \ -// RUN: -fptrauth-vtable-pointer-type-discrimination | \ -// RUN: FileCheck %s --check-prefixes=INTRIN,CALLS,RETS,VPTR_ADDR_DISCR,VPTR_TYPE_DISCR,NOINITFINI,NOFUNC - -// RUN: %clang_cc1 -E %s -triple=aarch64 \ -// RUN: -fptrauth-intrinsics \ -// RUN: -fptrauth-calls \ -// RUN: -fptrauth-returns \ -// RUN: -fptrauth-vtable-pointer-address-discrimination \ -// RUN: -fptrauth-vtable-pointer-type-discrimination \ -// RUN: -fptrauth-function-pointer-type-discrimination | \ -// RUN: FileCheck %s --check-prefixes=INTRIN,CALLS,RETS,VPTR_ADDR_DISCR,VPTR_TYPE_DISCR,NOINITFINI,FUNC +// RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-function-pointer-type-discrimination | \ +// RUN: FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,FUNC,NOINITFINI +// RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-init-fini | \ +// RUN: FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOFUNC,INITFINI #if __has_feature(ptrauth_intrinsics) // INTRIN: has_ptrauth_intrinsics @@ -114,16 +71,6 @@ void has_ptrauth_vtable_pointer_type_discrimination() {} void no_ptrauth_vtable_pointer_type_discrimination() {} #endif -#if __has_feature(ptrauth_init_fini) -// INITFINI: has_ptrauth_init_fini -void has_ptrauth_init_fini() {} -#else -// NOINITFINI: no_ptrauth_init_fini -void no_ptrauth_init_fini() {} -#endif - -#include - #if __has_feature(ptrauth_function_pointer_type_discrimination) // FUNC: has_ptrauth_function_pointer_type_discrimination void has_ptrauth_function_pointer_type_discrimination() {} @@ -131,3 +78,11 @@ void has_ptrauth_function_pointer_type_discrimination() {} // NOFUNC: no_ptrauth_function_pointer_type_discrimination void no_ptrauth_function_pointer_type_discrimination() {} #endif + +#if __has_feature(ptrauth_init_fini) +// INITFINI: has_ptrauth_init_fini +void has_ptrauth_init_fini() {} +#else +// NOINITFINI: no_ptrauth_init_fini +void no_ptrauth_init_fini() {} +#endif diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index 34ec19c70f48af..fd718a126aaa79 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -1643,12 +1643,12 @@ // CHECK-ZFBFMIN-EXT: __riscv_zfbfmin 1000000{{$}} // RUN: %clang --target=riscv32 -menable-experimental-extensions \ -// RUN: -march=rv32i_zicfilp0p4 -E -dM %s \ +// RUN: -march=rv32i_zicfilp1p0 -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZICFILP-EXT %s // RUN: %clang --target=riscv64 -menable-experimental-extensions \ -// RUN: -march=rv64i_zicfilp0p4 -E -dM %s \ +// RUN: -march=rv64i_zicfilp1p0 -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZICFILP-EXT %s -// CHECK-ZICFILP-EXT: __riscv_zicfilp 4000{{$}} +// CHECK-ZICFILP-EXT: __riscv_zicfilp 1000000{{$}} // RUN: %clang --target=riscv32-unknown-linux-gnu \ // RUN: -march=rv32iztso1p0 -E -dM %s \ @@ -1675,12 +1675,12 @@ // CHECK-ZVFBFWMA-EXT: __riscv_zvfbfwma 1000000{{$}} // RUN: %clang -target riscv32 -menable-experimental-extensions \ -// RUN: -march=rv32izicfiss0p4 -E -dM %s \ +// RUN: -march=rv32izicfiss1p0 -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZICFISS-EXT %s // RUN: %clang -target riscv64 -menable-experimental-extensions \ -// RUN: -march=rv64izicfiss0p4 -E -dM %s \ +// RUN: -march=rv64izicfiss1p0 -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZICFISS-EXT %s -// CHECK-ZICFISS-EXT: __riscv_zicfiss 4000{{$}} +// CHECK-ZICFISS-EXT: __riscv_zicfiss 1000000{{$}} // RUN: %clang --target=riscv32 -menable-experimental-extensions \ // RUN: -march=rv32i_ssnpm1p0 -E -dM %s \ diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c index 6e9c968273a466..5c0b815c8ae6f9 100644 --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -348,12 +348,12 @@ // RUN: %clang -target i386-unknown-unknown -march=atom -m3dnow -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=3DNOWPRFCHW %s -// 3DNOWPRFCHW: #define __3dNOW__ 1 +// 3DNOWPRFCHW-NOT: #define __3dNOW__ 1 // 3DNOWPRFCHW-NOT: #define __PRFCHW__ 1 // RUN: %clang -target i386-unknown-unknown -march=atom -mno-prfchw -m3dnow -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=3DNOWNOPRFCHW %s -// 3DNOWNOPRFCHW: #define __3dNOW__ 1 +// 3DNOWNOPRFCHW-NOT: #define __3dNOW__ 1 // 3DNOWNOPRFCHW-NOT: #define __PRFCHW__ 1 // RUN: %clang -target i386-unknown-unknown -march=atom -mprfchw -mno-3dnow -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NO3DNOWPRFCHW %s diff --git a/clang/test/Sema/shift-count-negative.c b/clang/test/Sema/shift-count-negative.c index 97f85feed52c01..84c7625187a687 100644 --- a/clang/test/Sema/shift-count-negative.c +++ b/clang/test/Sema/shift-count-negative.c @@ -1,6 +1,9 @@ // RUN: %clang_cc1 -x c -fsyntax-only -verify=expected,c -pedantic %s // RUN: %clang_cc1 -x c++ -fsyntax-only -verify=expected,cpp %s +// RUN: %clang_cc1 -x c -fsyntax-only -verify=expected,c -pedantic %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -x c++ -fsyntax-only -verify=expected,cpp %s -fexperimental-new-constant-interpreter + enum shiftof { X = (1<<-29) // c-warning {{expression is not an integer constant expression; folding it to a constant is a GNU extension}} // cpp-error@-1 {{expression is not an integral constant expression}} diff --git a/clang/test/SemaCUDA/device-use-host-var.cu b/clang/test/SemaCUDA/device-use-host-var.cu index 7904f654d65b36..c0f9fc4e8a67a7 100644 --- a/clang/test/SemaCUDA/device-use-host-var.cu +++ b/clang/test/SemaCUDA/device-use-host-var.cu @@ -111,7 +111,8 @@ __device__ void dev_fun(int *out) { // Check ODR-use of host variables in namespace is not allowed. *out = X::host_var; // dev-error {{reference to __host__ variable 'host_var' in __device__ function}} - // Check ODR-use of static host varables in class or file scope is not allowed. + // Check ODR-use of static host variables in class or file scope is not + // allowed. *out = A::host_var; // dev-error {{reference to __host__ variable 'host_var' in __device__ function}} *out = static_host_var; // dev-error {{reference to __host__ variable 'static_host_var' in __device__ function}} diff --git a/clang/test/SemaCXX/cxx0x-noexcept-expression.cpp b/clang/test/SemaCXX/cxx0x-noexcept-expression.cpp index a01edc77e02aff..b3c102830f3595 100644 --- a/clang/test/SemaCXX/cxx0x-noexcept-expression.cpp +++ b/clang/test/SemaCXX/cxx0x-noexcept-expression.cpp @@ -127,7 +127,7 @@ void f1() { // `dependent` should be type-dependent because the noexcept-expression should be value-dependent // (it is true if T is int*, false if T is Polymorphic* for example) dependent.f(); // This should need to be `.template f` to parse as a template - // expected-warning@-1 {{use 'template' keyword to treat 'f' as a dependent template name}} + // expected-error@-1 {{use 'template' keyword to treat 'f' as a dependent template name}} } template void f2() { @@ -135,14 +135,14 @@ void f2() { // X when T...[0] is a type with some operator&& which returns int* // X when sizeof...(T) == 0 dependent.f(); - // expected-warning@-1 {{use 'template' keyword to treat 'f' as a dependent template name}} + // expected-error@-1 {{use 'template' keyword to treat 'f' as a dependent template name}} } template void f3() { X(nullptr)))> dependent; // X when T is int, X when T is Polymorphic dependent.f(); - // expected-warning@-1 {{use 'template' keyword to treat 'f' as a dependent template name}} + // expected-error@-1 {{use 'template' keyword to treat 'f' as a dependent template name}} } template void f4() { diff --git a/clang/test/SemaCXX/cxx2a-consteval.cpp b/clang/test/SemaCXX/cxx2a-consteval.cpp index fb8385eb020519..ba80e57f814244 100644 --- a/clang/test/SemaCXX/cxx2a-consteval.cpp +++ b/clang/test/SemaCXX/cxx2a-consteval.cpp @@ -891,13 +891,13 @@ struct S { }; void func() { - // Explictly defaulted constructor. + // Explicitly defaulted constructor. S s1; S s2; // User provided constructor. S s3; S s4; - // Consteval explictly defaulted constructor. + // Consteval explicitly defaulted constructor. S s5; // expected-error {{call to consteval function 'multiple_default_constructors::S::S' is not a constant expression}} \ expected-note {{in call to 'S()'}} S s6; diff --git a/clang/test/SemaCXX/cxx2b-deducing-this.cpp b/clang/test/SemaCXX/cxx2b-deducing-this.cpp index 2c19b091fabad0..5cbc1f735383b0 100644 --- a/clang/test/SemaCXX/cxx2b-deducing-this.cpp +++ b/clang/test/SemaCXX/cxx2b-deducing-this.cpp @@ -918,3 +918,44 @@ struct C { } }; } + +namespace GH85992 { +namespace N { +struct A { + int f(this A); +}; + +int f(A); +} + +struct S { + int (S::*x)(this int); // expected-error {{an explicit object parameter can only appear as the first parameter of a member function}} + int (*y)(this int); // expected-error {{an explicit object parameter can only appear as the first parameter of a member function}} + int (***z)(this int); // expected-error {{an explicit object parameter can only appear as the first parameter of a member function}} + + int f(this S); + int ((g))(this S); + friend int h(this S); // expected-error {{an explicit object parameter cannot appear in a non-member function}} + int h(int x, int (*)(this S)); // expected-error {{an explicit object parameter can only appear as the first parameter of a member function}} + + struct T { + int f(this T); + }; + + friend int T::f(this T); + friend int N::A::f(this N::A); + friend int N::f(this N::A); // expected-error {{an explicit object parameter cannot appear in a non-member function}} + int friend func(this T); // expected-error {{an explicit object parameter cannot appear in a non-member function}} +}; + +using T = int (*)(this int); // expected-error {{an explicit object parameter can only appear as the first parameter of a member function}} +using U = int (S::*)(this int); // expected-error {{an explicit object parameter can only appear as the first parameter of a member function}} +int h(this int); // expected-error {{an explicit object parameter cannot appear in a non-member function}} + +int S::f(this S) { return 1; } + +namespace a { +void f(); +}; +void a::f(this auto) {} // expected-error {{an explicit object parameter cannot appear in a non-member function}} +} diff --git a/clang/test/SemaCXX/enum.cpp b/clang/test/SemaCXX/enum.cpp index 7d4a05083b9cdb..739d35ec4a06b8 100644 --- a/clang/test/SemaCXX/enum.cpp +++ b/clang/test/SemaCXX/enum.cpp @@ -1,5 +1,9 @@ // RUN: %clang_cc1 -fsyntax-only -pedantic -std=c++98 -verify -triple x86_64-apple-darwin %s // RUN: %clang_cc1 -fsyntax-only -pedantic -std=c++11 -verify -triple x86_64-apple-darwin %s + +// RUN: %clang_cc1 -fsyntax-only -pedantic -std=c++98 -verify -triple x86_64-apple-darwin %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -fsyntax-only -pedantic -std=c++11 -verify -triple x86_64-apple-darwin %s -fexperimental-new-constant-interpreter + enum E { // expected-note{{previous definition is here}} Val1, Val2 diff --git a/clang/test/SemaCXX/pseudo-destructors.cpp b/clang/test/SemaCXX/pseudo-destructors.cpp index 44dc9ce8b15208..55a96002be2abd 100644 --- a/clang/test/SemaCXX/pseudo-destructors.cpp +++ b/clang/test/SemaCXX/pseudo-destructors.cpp @@ -22,21 +22,21 @@ void cv_test(const volatile T* cvt) { void f(A* a, Foo *f, int *i, double *d, int ii) { a->~A(); a->A::~A(); - + a->~foo(); // expected-error{{undeclared identifier 'foo' in destructor name}} - + a->~Bar(); // expected-error{{destructor type 'Bar' (aka 'Foo') in object destruction expression does not match the type 'A' of the object being destroyed}} - + f->~Bar(); f->~Foo(); i->~Bar(); // expected-error{{does not match}} - + g().~Bar(); // expected-error{{non-scalar}} - + f->::~Bar(); // expected-error {{not a structure or union}} f->::Bar::~Bar(); f->N::~Wibble(); // expected-error{{'N' does not refer to a type}} expected-error{{'Wibble' does not refer to a type}} - + f->Bar::~Bar(17, 42); // expected-error{{cannot have any arguments}} i->~Integer(); @@ -148,12 +148,12 @@ namespace TwoPhaseLookup { namespace Template { template struct Y {}; template using G = Y; - template void f(T *p) { p->~G(); } // expected-error {{no member named 'G'}} + template void f(T *p) { p->~G(); } // expected-error {{no member named '~Y'}} void h1(Y *p) { p->~G(); } - void h2(Y *p) { f(p); } // expected-note {{instantiation of}} + void h2(Y *p) { f(p); } namespace N { template struct G {}; } void h3(N::G *p) { p->~G(); } - void h4(N::G *p) { f(p); } + void h4(N::G *p) { f(p); } // expected-note {{instantiation of}} } namespace TemplateUndeclared { diff --git a/clang/test/SemaCXX/static-assert-cxx17.cpp b/clang/test/SemaCXX/static-assert-cxx17.cpp index 754f4ae5f1d388..41a7b025d0eb75 100644 --- a/clang/test/SemaCXX/static-assert-cxx17.cpp +++ b/clang/test/SemaCXX/static-assert-cxx17.cpp @@ -96,7 +96,7 @@ void foo6() { // expected-error@-1{{static assertion failed due to requirement 'static_cast *>(nullptr)'}} static_assert((const X[]){} == nullptr); // expected-error@-1{{static assertion failed due to requirement '(const X[0]){} == nullptr'}} - static_assert(sizeof(X().template X::~X())>) == 0); + static_assert(sizeof(X().X::~X())>) == 0); // expected-error@-1{{static assertion failed due to requirement 'sizeof(X) == 0'}} \ // expected-note@-1 {{evaluates to '8 == 0'}} static_assert(constexpr_return_false()); diff --git a/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl b/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl index fecf3b76ff7bb6..774309c714f657 100644 --- a/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl @@ -1,6 +1,5 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -fsyntax-only -verify %s -Resource ResourceDescriptorHeap[5]; typedef vector float3; RWBuffer Buffer; diff --git a/clang/test/SemaTemplate/dependent-base-classes.cpp b/clang/test/SemaTemplate/dependent-base-classes.cpp index 4cb88a5b4070a1..92a37efaa7e73f 100644 --- a/clang/test/SemaTemplate/dependent-base-classes.cpp +++ b/clang/test/SemaTemplate/dependent-base-classes.cpp @@ -1,12 +1,12 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s template -struct X0 : T::template apply { +struct X0 : T::template apply { X0(U u) : T::template apply(u) { } }; template -struct X1 : T::apply { }; // expected-warning{{use 'template' keyword to treat 'apply' as a dependent template name}} +struct X1 : T::apply { }; // expected-error{{use 'template' keyword to treat 'apply' as a dependent template name}} template struct X2 : vector { }; // expected-error{{no template named 'vector'}} @@ -85,7 +85,7 @@ namespace PR6081 { struct A { }; template - class B : public A + class B : public A { public: template< class X > @@ -109,9 +109,9 @@ namespace PR6081 { namespace PR6413 { template class Base_A { }; - + class Base_B { }; - + template class Derived : public virtual Base_A @@ -120,12 +120,12 @@ namespace PR6413 { } namespace PR5812 { - template struct Base { - Base* p; - }; + template struct Base { + Base* p; + }; - template struct Derived: public Base { - typename Derived::Base* p; // meaning Derived::Base + template struct Derived: public Base { + typename Derived::Base* p; // meaning Derived::Base }; Derived di; diff --git a/clang/test/SemaTemplate/dependent-template-recover.cpp b/clang/test/SemaTemplate/dependent-template-recover.cpp index c763989e6dadb2..c7e27e8da25f16 100644 --- a/clang/test/SemaTemplate/dependent-template-recover.cpp +++ b/clang/test/SemaTemplate/dependent-template-recover.cpp @@ -2,15 +2,15 @@ template struct X { void f(T* t) { - t->f0(); // expected-warning{{use 'template' keyword to treat 'f0' as a dependent template name}} - t->f0(); // expected-warning{{use 'template' keyword to treat 'f0' as a dependent template name}} + t->f0(); // expected-error{{use 'template' keyword to treat 'f0' as a dependent template name}} + t->f0(); // expected-error{{use 'template' keyword to treat 'f0' as a dependent template name}} - t->operator+(1); // expected-warning{{use 'template' keyword to treat 'operator +' as a dependent template name}} - t->f1(1); // expected-warning{{use 'template' keyword to treat 'f1' as a dependent template name}} + t->operator+(1); // expected-error{{use 'template' keyword to treat 'operator +' as a dependent template name}} + t->f1(1); // expected-error{{use 'template' keyword to treat 'f1' as a dependent template name}} t->f1<3, int const>(1); // expected-error{{missing 'template' keyword prior to dependent template name 'f1'}} - T::getAs(); // expected-warning{{use 'template' keyword to treat 'getAs' as a dependent template name}} - t->T::getAs(); // expected-warning{{use 'template' keyword to treat 'getAs' as a dependent template name}} + T::getAs(); // expected-error{{use 'template' keyword to treat 'getAs' as a dependent template name}} + t->T::getAs(); // expected-error{{use 'template' keyword to treat 'getAs' as a dependent template name}} (*t).f2(); // expected-error{{missing 'template' keyword prior to dependent template name 'f2'}} (*t).f2<0>(); // expected-error{{missing 'template' keyword prior to dependent template name 'f2'}} diff --git a/clang/test/SemaTemplate/instantiate-local-class.cpp b/clang/test/SemaTemplate/instantiate-local-class.cpp index 7eee131e28d60c..298233739900f6 100644 --- a/clang/test/SemaTemplate/instantiate-local-class.cpp +++ b/clang/test/SemaTemplate/instantiate-local-class.cpp @@ -512,24 +512,26 @@ namespace LambdaInDefaultMemberInitializer { } #if __cplusplus >= 201703L -namespace GH35052 { -template constexpr int func(F f) { - if constexpr (f(1UL)) { - return 1; +// Reduced from https://github.com/llvm/llvm-project/issues/98526 +// This relies on the deferral instantiation of the local lambda, otherwise we would fail in DeduceReturnType(). +namespace local_recursive_lambda { + +template struct recursive_lambda { + template auto operator()(Args &&...args) const { + return fn(*this, args...); } - return 0; -} + F fn; +}; -int main() { - auto predicate = [](auto v) /*implicit constexpr*/ -> bool { - return v == 1; - }; +template recursive_lambda(F) -> recursive_lambda; - static_assert(predicate(1)); - return func(predicate); +void foo() { + recursive_lambda{[&](auto &self_fn, int) -> int { + return self_fn(0); + }}(0); } -} // namespace GH35052 +} // namespace local_recursive_lambda #endif diff --git a/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp b/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp index 7768d2f03ac5ba..ad73daa8e214c3 100644 --- a/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp +++ b/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp @@ -115,7 +115,7 @@ namespace CopyCounting { static_assert(f(X()) == 0); template struct Y { void f(); }; - template void g(Y y) { y.template Y::f(); } + template void g(Y y) { y.Y::f(); } void h() { constexpr A a; g(Y{}); } template struct Z { diff --git a/clang/test/SemaTemplate/template-id-expr.cpp b/clang/test/SemaTemplate/template-id-expr.cpp index 760d6c5852403d..dc12823ae307fb 100644 --- a/clang/test/SemaTemplate/template-id-expr.cpp +++ b/clang/test/SemaTemplate/template-id-expr.cpp @@ -19,7 +19,7 @@ template struct X0 { template void f1(); - + template void f2(U) { f1(); @@ -39,9 +39,9 @@ struct Y { template struct X { X(int, int); - void f() { - Y >(X(0, 0)); - Y >(::X(0, 0)); + void f() { + Y >(X(0, 0)); + Y >(::X(0, 0)); } }; @@ -149,11 +149,11 @@ struct Y2 : Y1 { int x; x = Y1::f4(0); - x = Y1::f4(0); // expected-warning {{use 'template'}} expected-error {{assigning to 'int' from incompatible type 'void'}} + x = Y1::f4(0); // expected-error {{use 'template'}} expected-error {{assigning to 'int' from incompatible type 'void'}} x = Y1::template f4(0); // expected-error {{assigning to 'int' from incompatible type 'void'}} expected-error {{a template argument list is expected after a name prefixed by the template keyword}} x = p->f4(0); - x = p->f4(0); // expected-error {{assigning to 'int' from incompatible type 'void'}} expected-warning {{use 'template'}} + x = p->f4(0); // expected-error {{assigning to 'int' from incompatible type 'void'}} expected-error {{use 'template'}} x = p->template f4(0); // expected-error {{assigning to 'int' from incompatible type 'void'}} expected-error {{a template argument list is expected after a name prefixed by the template keyword}} } }; @@ -184,7 +184,7 @@ class E { #if __cplusplus <= 199711L // expected-warning@+2 {{extension}} #endif -template using D = int; // expected-note {{declared here}} +template using D = int; // expected-note {{declared here}} E ed; // expected-note {{instantiation of}} namespace non_functions { diff --git a/clang/test/SemaTemplate/typename-specifier-3.cpp b/clang/test/SemaTemplate/typename-specifier-3.cpp index a9010969322e55..714830f0032d28 100644 --- a/clang/test/SemaTemplate/typename-specifier-3.cpp +++ b/clang/test/SemaTemplate/typename-specifier-3.cpp @@ -46,7 +46,7 @@ namespace PR12884_half_fixed { typedef int arg; }; struct C { - typedef typename B::X x; // expected-warning {{use 'template'}} expected-error {{refers to non-type}} + typedef typename B::X x; // expected-error {{use 'template'}} expected-error {{refers to non-type}} }; }; diff --git a/clang/tools/c-index-test/c-index-test.c b/clang/tools/c-index-test/c-index-test.c index e078e9bdce027a..f472a67f3bc5b7 100644 --- a/clang/tools/c-index-test/c-index-test.c +++ b/clang/tools/c-index-test/c-index-test.c @@ -1840,6 +1840,22 @@ static enum CXChildVisitResult PrintTypeSize(CXCursor cursor, CXCursor p, return CXChildVisit_Recurse; } +static enum CXChildVisitResult PrintBinOps(CXCursor C, CXCursor p, + CXClientData d) { + enum CXCursorKind ck = clang_getCursorKind(C); + enum CX_BinaryOperatorKind bok; + CXString opstr; + if (ck != CXCursor_BinaryOperator && ck != CXCursor_CompoundAssignOperator) + return CXChildVisit_Recurse; + + PrintCursor(C, NULL); + bok = clang_Cursor_getBinaryOpcode(C); + opstr = clang_Cursor_getBinaryOpcodeStr(bok); + printf(" BinOp=%s %d\n", clang_getCString(opstr), bok); + clang_disposeString(opstr); + return CXChildVisit_Recurse; +} + /******************************************************************************/ /* Mangling testing. */ /******************************************************************************/ @@ -5098,6 +5114,8 @@ int cindextest_main(int argc, const char **argv) { else if (argc > 2 && strcmp(argv[1], "-test-print-bitwidth") == 0) return perform_test_load_source(argc - 2, argv + 2, "all", PrintBitWidth, 0); + else if (argc > 2 && strcmp(argv[1], "-test-print-binops") == 0) + return perform_test_load_source(argc - 2, argv + 2, "all", PrintBinOps, 0); else if (argc > 2 && strcmp(argv[1], "-test-print-mangle") == 0) return perform_test_load_tu(argv[2], "all", NULL, PrintMangledName, NULL); else if (argc > 2 && strcmp(argv[1], "-test-print-manglings") == 0) diff --git a/clang/tools/driver/cc1as_main.cpp b/clang/tools/driver/cc1as_main.cpp index 4e0aa1450563e5..ec93f092713f59 100644 --- a/clang/tools/driver/cc1as_main.cpp +++ b/clang/tools/driver/cc1as_main.cpp @@ -98,6 +98,8 @@ struct AssemblerInvocation { LLVM_PREFERRED_TYPE(bool) unsigned RelaxELFRelocations : 1; LLVM_PREFERRED_TYPE(bool) + unsigned SSE2AVX : 1; + LLVM_PREFERRED_TYPE(bool) unsigned Dwarf64 : 1; unsigned DwarfVersion; std::string DwarfDebugFlags; @@ -197,6 +199,7 @@ struct AssemblerInvocation { ShowInst = 0; ShowEncoding = 0; RelaxAll = 0; + SSE2AVX = 0; NoExecStack = 0; FatalWarnings = 0; NoWarn = 0; @@ -288,6 +291,7 @@ bool AssemblerInvocation::CreateFromArgs(AssemblerInvocation &Opts, } Opts.RelaxELFRelocations = !Args.hasArg(OPT_mrelax_relocations_no); + Opts.SSE2AVX = Args.hasArg(OPT_msse2avx); if (auto *DwarfFormatArg = Args.getLastArg(OPT_gdwarf64, OPT_gdwarf32)) Opts.Dwarf64 = DwarfFormatArg->getOption().matches(OPT_gdwarf64); Opts.DwarfVersion = getLastArgIntValue(Args, OPT_dwarf_version_EQ, 2, Diags); @@ -437,6 +441,7 @@ static bool ExecuteAssemblerImpl(AssemblerInvocation &Opts, MCOptions.MCSaveTempLabels = Opts.SaveTemporaryLabels; MCOptions.Crel = Opts.Crel; MCOptions.X86RelaxRelocations = Opts.RelaxELFRelocations; + MCOptions.X86Sse2Avx = Opts.SSE2AVX; MCOptions.CompressDebugSections = Opts.CompressDebugSections; MCOptions.AsSecureLogFile = Opts.AsSecureLogFile; diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 35312e3d2ae702..fe0be203cb4622 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -5225,6 +5225,11 @@ CXString clang_getCursorSpelling(CXCursor C) { return cxstring::createDup(OS.str()); } + if (C.kind == CXCursor_BinaryOperator || + C.kind == CXCursor_CompoundAssignOperator) { + return clang_Cursor_getBinaryOpcodeStr(clang_Cursor_getBinaryOpcode(C)); + } + const Decl *D = getDeclFromExpr(getCursorExpr(C)); if (D) return getDeclSpelling(D); @@ -8955,6 +8960,35 @@ unsigned clang_Cursor_isExternalSymbol(CXCursor C, CXString *language, return 0; } +enum CX_BinaryOperatorKind clang_Cursor_getBinaryOpcode(CXCursor C) { + if (C.kind != CXCursor_BinaryOperator && + C.kind != CXCursor_CompoundAssignOperator) { + return CX_BO_Invalid; + } + + const Expr *D = getCursorExpr(C); + if (const auto *BinOp = dyn_cast(D)) { + switch (BinOp->getOpcode()) { +#define BINARY_OPERATION(Name, Spelling) \ + case BO_##Name: \ + return CX_BO_##Name; +#include "clang/AST/OperationKinds.def" + } + } + + return CX_BO_Invalid; +} + +CXString clang_Cursor_getBinaryOpcodeStr(enum CX_BinaryOperatorKind Op) { + if (Op > CX_BO_LAST) + return cxstring::createEmpty(); + + return cxstring::createDup( + // BinaryOperator::getOpcodeStr has no case for CX_BO_Invalid, + // so subtract 1 + BinaryOperator::getOpcodeStr(static_cast(Op - 1))); +} + CXSourceRange clang_Cursor_getCommentRange(CXCursor C) { if (!clang_isDeclaration(C.kind)) return clang_getNullRange(); diff --git a/clang/tools/libclang/CXIndexDataConsumer.cpp b/clang/tools/libclang/CXIndexDataConsumer.cpp index c1022263a51280..8d364ed8876a12 100644 --- a/clang/tools/libclang/CXIndexDataConsumer.cpp +++ b/clang/tools/libclang/CXIndexDataConsumer.cpp @@ -861,7 +861,7 @@ bool CXIndexDataConsumer::handleObjCProperty(const ObjCPropertyDecl *D) { } bool CXIndexDataConsumer::handleNamespace(const NamespaceDecl *D) { - DeclInfo DInfo(/*isRedeclaration=*/!D->isOriginalNamespace(), + DeclInfo DInfo(/*isRedeclaration=*/!D->isFirstDecl(), /*isDefinition=*/true, /*isContainer=*/true); return handleDecl(D, D->getLocation(), getCursor(D), DInfo); diff --git a/clang/tools/libclang/libclang.map b/clang/tools/libclang/libclang.map index 5676198a286d9b..91c329b5765d40 100644 --- a/clang/tools/libclang/libclang.map +++ b/clang/tools/libclang/libclang.map @@ -54,6 +54,8 @@ LLVM_13 { clang_Cursor_Evaluate; clang_Cursor_getArgument; clang_Cursor_getBriefCommentText; + clang_Cursor_getBinaryOpcode; + clang_Cursor_getBinaryOpcodeStr; clang_Cursor_getCXXManglings; clang_Cursor_getCommentRange; clang_Cursor_getMangling; diff --git a/clang/unittests/Analysis/FlowSensitive/TestingSupport.cpp b/clang/unittests/Analysis/FlowSensitive/TestingSupport.cpp index 5c4d42c6ccdcf8..622de6323ad33a 100644 --- a/clang/unittests/Analysis/FlowSensitive/TestingSupport.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TestingSupport.cpp @@ -176,13 +176,18 @@ llvm::Error test::checkDataflowWithNoopAnalysis( DataflowAnalysisOptions Options, LangStandard::Kind Std, std::function(QualType)> SyntheticFieldCallback) { llvm::SmallVector ASTBuildArgs = { + "-fsyntax-only", // -fnodelayed-template-parsing is the default everywhere but on Windows. // Set it explicitly so that tests behave the same on Windows as on other // platforms. + "-fno-delayed-template-parsing", // Set -Wno-unused-value because it's often desirable in tests to write // expressions with unused value, and we don't want the output to be // cluttered with warnings about them. - "-fsyntax-only", "-fno-delayed-template-parsing", "-Wno-unused-value", + "-Wno-unused-value", + // Some build environments don't have RTTI enabled by default. + // Enable it explicitly to make sure tests work in all environments. + "-frtti", "-std=" + std::string(LangStandard::getLangStandardForKind(Std).getName())}; AnalysisInputs AI( diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp index e743eefa5d4585..39e7001393e5e9 100644 --- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp @@ -1637,6 +1637,49 @@ TEST(TransferTest, StructModeledFieldsWithAccessor) { }); } +TEST(TransferTest, StructModeledFieldsInTypeid) { + // Test that we model fields mentioned inside a `typeid()` expression only if + // that expression is potentially evaluated -- i.e. if the expression inside + // `typeid()` is a glvalue of polymorphic type (see + // `CXXTypeidExpr::isPotentiallyEvaluated()` and [expr.typeid]p3). + std::string Code = R"( + // Definitions needed for `typeid`. + namespace std { + class type_info {}; + class bad_typeid {}; + } // namespace std + + struct NonPolymorphic {}; + + struct Polymorphic { + virtual ~Polymorphic() = default; + }; + + struct S { + NonPolymorphic *NonPoly; + Polymorphic *Poly; + }; + + void target(S &s) { + typeid(*s.NonPoly); + typeid(*s.Poly); + // [[p]] + } + )"; + runDataflow( + Code, + [](const llvm::StringMap> &Results, + ASTContext &ASTCtx) { + const Environment &Env = getEnvironmentAtAnnotation(Results, "p"); + auto &SLoc = getLocForDecl(ASTCtx, Env, "s"); + std::vector Fields; + for (auto [Field, _] : SLoc.children()) + Fields.push_back(Field); + EXPECT_THAT(Fields, + UnorderedElementsAre(findValueDecl(ASTCtx, "Poly"))); + }); +} + TEST(TransferTest, StructModeledFieldsWithComplicatedInheritance) { std::string Code = R"( struct Base1 { diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 283843ad7ab472..d01ce137b8fcbd 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -16875,7 +16875,7 @@ TEST_F(FormatTest, ConfigurableSpaceBeforeParens) { verifyFormat("int f();", SpaceFuncDef); verifyFormat("void f (int a, T b) {}", SpaceFuncDef); verifyFormat("void __attribute__((asdf)) f (int a, T b) {}", SpaceFuncDef); - verifyFormat("A::A() : a(1) {}", SpaceFuncDef); + verifyFormat("A::A () : a(1) {}", SpaceFuncDef); verifyFormat("void f() __attribute__((asdf));", SpaceFuncDef); verifyFormat("void __attribute__((asdf)) f();", SpaceFuncDef); verifyFormat("#define A(x) x", SpaceFuncDef); @@ -16901,7 +16901,8 @@ TEST_F(FormatTest, ConfigurableSpaceBeforeParens) { verifyFormat("T A::operator()() {}", SpaceFuncDef); verifyFormat("auto lambda = [] () { return 0; };", SpaceFuncDef); verifyFormat("int x = int(y);", SpaceFuncDef); - verifyFormat("M(std::size_t R, std::size_t C) : C(C), data(R) {}", + verifyFormat("void foo::bar () {}", SpaceFuncDef); + verifyFormat("M (std::size_t R, std::size_t C) : C(C), data(R) {}", SpaceFuncDef); FormatStyle SpaceIfMacros = getLLVMStyle(); diff --git a/compiler-rt/CODE_OWNERS.TXT b/compiler-rt/CODE_OWNERS.TXT index 570ab865080600..bd51a1073cc386 100644 --- a/compiler-rt/CODE_OWNERS.TXT +++ b/compiler-rt/CODE_OWNERS.TXT @@ -71,3 +71,7 @@ D: Profile runtime library N: Christopher Apple, David Trevelyan E: cja-private@pm.me, realtime.sanitizer@gmail.com D: Realtime Sanitizer (RTSan) + +N: Alexander Shaposhnikov +E: alexander.v.shaposhnikov@gmail.com +D: Numerical Sanitizer (NSAN) diff --git a/compiler-rt/lib/asan/asan_globals_win.cpp b/compiler-rt/lib/asan/asan_globals_win.cpp index 19af88ab12b40a..9442cc35d5ab74 100644 --- a/compiler-rt/lib/asan/asan_globals_win.cpp +++ b/compiler-rt/lib/asan/asan_globals_win.cpp @@ -17,10 +17,10 @@ namespace __asan { #pragma section(".ASAN$GA", read, write) #pragma section(".ASAN$GZ", read, write) -extern "C" __declspec(allocate(".ASAN$GA")) - ALIGNED(sizeof(__asan_global)) __asan_global __asan_globals_start = {}; -extern "C" __declspec(allocate(".ASAN$GZ")) - ALIGNED(sizeof(__asan_global)) __asan_global __asan_globals_end = {}; +extern "C" alignas(sizeof(__asan_global)) + __declspec(allocate(".ASAN$GA")) __asan_global __asan_globals_start = {}; +extern "C" alignas(sizeof(__asan_global)) + __declspec(allocate(".ASAN$GZ")) __asan_global __asan_globals_end = {}; #pragma comment(linker, "/merge:.ASAN=.data") static void call_on_globals(void (*hook)(__asan_global *, uptr)) { diff --git a/compiler-rt/lib/asan/asan_malloc_linux.cpp b/compiler-rt/lib/asan/asan_malloc_linux.cpp index d426b923c94eda..08a63045c4e652 100644 --- a/compiler-rt/lib/asan/asan_malloc_linux.cpp +++ b/compiler-rt/lib/asan/asan_malloc_linux.cpp @@ -185,11 +185,11 @@ struct MallocDebugL { void* (*valloc)(uptr size); }; -ALIGNED(32) const MallocDebugK asan_malloc_dispatch_k = { +alignas(32) const MallocDebugK asan_malloc_dispatch_k = { WRAP(malloc), WRAP(free), WRAP(calloc), WRAP(realloc), WRAP(memalign), WRAP(malloc_usable_size)}; -ALIGNED(32) const MallocDebugL asan_malloc_dispatch_l = { +alignas(32) const MallocDebugL asan_malloc_dispatch_l = { WRAP(calloc), WRAP(free), WRAP(mallinfo), WRAP(malloc), WRAP(malloc_usable_size), WRAP(memalign), WRAP(posix_memalign), WRAP(pvalloc), WRAP(realloc), diff --git a/compiler-rt/lib/asan/asan_report.cpp b/compiler-rt/lib/asan/asan_report.cpp index c9730dd368cb4c..fd590e401f67fe 100644 --- a/compiler-rt/lib/asan/asan_report.cpp +++ b/compiler-rt/lib/asan/asan_report.cpp @@ -34,8 +34,8 @@ namespace __asan { // -------------------- User-specified callbacks ----------------- {{{1 static void (*error_report_callback)(const char*); using ErrorMessageBuffer = InternalMmapVectorNoCtor; -static ALIGNED( - alignof(ErrorMessageBuffer)) char error_message_buffer_placeholder +alignas( + alignof(ErrorMessageBuffer)) static char error_message_buffer_placeholder [sizeof(ErrorMessageBuffer)]; static ErrorMessageBuffer *error_message_buffer = nullptr; static Mutex error_message_buf_mutex; diff --git a/compiler-rt/lib/asan/asan_suppressions.cpp b/compiler-rt/lib/asan/asan_suppressions.cpp index 6cee6749603954..94289d14d7e780 100644 --- a/compiler-rt/lib/asan/asan_suppressions.cpp +++ b/compiler-rt/lib/asan/asan_suppressions.cpp @@ -20,7 +20,7 @@ namespace __asan { -ALIGNED(64) static char suppression_placeholder[sizeof(SuppressionContext)]; +alignas(64) static char suppression_placeholder[sizeof(SuppressionContext)]; static SuppressionContext *suppression_ctx = nullptr; static const char kInterceptorName[] = "interceptor_name"; static const char kInterceptorViaFunction[] = "interceptor_via_fun"; diff --git a/compiler-rt/lib/asan/asan_thread.cpp b/compiler-rt/lib/asan/asan_thread.cpp index 480a423952e8f3..c79c33ab01342f 100644 --- a/compiler-rt/lib/asan/asan_thread.cpp +++ b/compiler-rt/lib/asan/asan_thread.cpp @@ -67,10 +67,10 @@ static void InitThreads() { // thread before all TSD destructors will be called for it. // MIPS requires aligned address - static ALIGNED(alignof( - ThreadRegistry)) char thread_registry_placeholder[sizeof(ThreadRegistry)]; - static ALIGNED(alignof( - ThreadArgRetval)) char thread_data_placeholder[sizeof(ThreadArgRetval)]; + alignas(alignof(ThreadRegistry)) static char + thread_registry_placeholder[sizeof(ThreadRegistry)]; + alignas(alignof(ThreadArgRetval)) static char + thread_data_placeholder[sizeof(ThreadArgRetval)]; asan_thread_registry = new (thread_registry_placeholder) ThreadRegistry(GetAsanThreadContext); diff --git a/compiler-rt/lib/dfsan/dfsan_allocator.h b/compiler-rt/lib/dfsan/dfsan_allocator.h index 3b4171b6314d6d..6ff24fc57a8556 100644 --- a/compiler-rt/lib/dfsan/dfsan_allocator.h +++ b/compiler-rt/lib/dfsan/dfsan_allocator.h @@ -18,7 +18,7 @@ namespace __dfsan { struct DFsanThreadLocalMallocStorage { - ALIGNED(8) uptr allocator_cache[96 * (512 * 8 + 16)]; // Opaque. + alignas(8) uptr allocator_cache[96 * (512 * 8 + 16)]; // Opaque. void CommitBack(); private: diff --git a/compiler-rt/lib/hwasan/hwasan_allocator.cpp b/compiler-rt/lib/hwasan/hwasan_allocator.cpp index 7771127731de88..75dbb336e3445c 100644 --- a/compiler-rt/lib/hwasan/hwasan_allocator.cpp +++ b/compiler-rt/lib/hwasan/hwasan_allocator.cpp @@ -44,7 +44,7 @@ enum { // Initialized in HwasanAllocatorInit, an never changed. -static ALIGNED(16) u8 tail_magic[kShadowAlignment - 1]; +alignas(16) static u8 tail_magic[kShadowAlignment - 1]; static uptr max_malloc_size; bool HwasanChunkView::IsAllocated() const { diff --git a/compiler-rt/lib/hwasan/hwasan_thread_list.cpp b/compiler-rt/lib/hwasan/hwasan_thread_list.cpp index e56d19aad26738..794cfb7550d77e 100644 --- a/compiler-rt/lib/hwasan/hwasan_thread_list.cpp +++ b/compiler-rt/lib/hwasan/hwasan_thread_list.cpp @@ -14,15 +14,15 @@ ThreadArgRetval &hwasanThreadArgRetval() { return *thread_data; } void InitThreadList(uptr storage, uptr size) { CHECK_EQ(hwasan_thread_list, nullptr); - static ALIGNED(alignof( - HwasanThreadList)) char thread_list_placeholder[sizeof(HwasanThreadList)]; + alignas(alignof(HwasanThreadList)) static char + thread_list_placeholder[sizeof(HwasanThreadList)]; hwasan_thread_list = new (thread_list_placeholder) HwasanThreadList(storage, size); CHECK_EQ(thread_data, nullptr); - static ALIGNED(alignof( - ThreadArgRetval)) char thread_data_placeholder[sizeof(ThreadArgRetval)]; + alignas(alignof(ThreadArgRetval)) static char + thread_data_placeholder[sizeof(ThreadArgRetval)]; thread_data = new (thread_data_placeholder) ThreadArgRetval(); } diff --git a/compiler-rt/lib/lsan/lsan_common.cpp b/compiler-rt/lib/lsan/lsan_common.cpp index 0ecded8b28cdb0..183df6e5ca14bd 100644 --- a/compiler-rt/lib/lsan/lsan_common.cpp +++ b/compiler-rt/lib/lsan/lsan_common.cpp @@ -108,7 +108,7 @@ class LeakSuppressionContext { void PrintMatchedSuppressions(); }; -ALIGNED(64) static char suppression_placeholder[sizeof(LeakSuppressionContext)]; +alignas(64) static char suppression_placeholder[sizeof(LeakSuppressionContext)]; static LeakSuppressionContext *suppression_ctx = nullptr; static const char kSuppressionLeak[] = "leak"; static const char *kSuppressionTypes[] = {kSuppressionLeak}; diff --git a/compiler-rt/lib/lsan/lsan_common_linux.cpp b/compiler-rt/lib/lsan/lsan_common_linux.cpp index 692ad35169e1d8..7a0b2f038be0d3 100644 --- a/compiler-rt/lib/lsan/lsan_common_linux.cpp +++ b/compiler-rt/lib/lsan/lsan_common_linux.cpp @@ -28,7 +28,7 @@ namespace __lsan { static const char kLinkerName[] = "ld"; -static char linker_placeholder[sizeof(LoadedModule)] ALIGNED(64); +alignas(64) static char linker_placeholder[sizeof(LoadedModule)]; static LoadedModule *linker = nullptr; static bool IsLinker(const LoadedModule& module) { diff --git a/compiler-rt/lib/lsan/lsan_thread.cpp b/compiler-rt/lib/lsan/lsan_thread.cpp index 8aa3111eecf7d1..07c7b923623fa9 100644 --- a/compiler-rt/lib/lsan/lsan_thread.cpp +++ b/compiler-rt/lib/lsan/lsan_thread.cpp @@ -35,12 +35,12 @@ static ThreadContextBase *CreateThreadContext(u32 tid) { } void InitializeThreads() { - static ALIGNED(alignof( - ThreadRegistry)) char thread_registry_placeholder[sizeof(ThreadRegistry)]; + alignas(alignof(ThreadRegistry)) static char + thread_registry_placeholder[sizeof(ThreadRegistry)]; thread_registry = new (thread_registry_placeholder) ThreadRegistry(CreateThreadContext); - static ALIGNED(alignof(ThreadArgRetval)) char + alignas(alignof(ThreadArgRetval)) static char thread_arg_retval_placeholder[sizeof(ThreadArgRetval)]; thread_arg_retval = new (thread_arg_retval_placeholder) ThreadArgRetval(); } diff --git a/compiler-rt/lib/memprof/memprof_thread.cpp b/compiler-rt/lib/memprof/memprof_thread.cpp index 9512a87cf98e40..e2bca9bb422f71 100644 --- a/compiler-rt/lib/memprof/memprof_thread.cpp +++ b/compiler-rt/lib/memprof/memprof_thread.cpp @@ -37,7 +37,7 @@ void MemprofThreadContext::OnFinished() { thread = nullptr; } -static ALIGNED(16) char thread_registry_placeholder[sizeof(ThreadRegistry)]; +alignas(16) static char thread_registry_placeholder[sizeof(ThreadRegistry)]; static ThreadRegistry *memprof_thread_registry; static Mutex mu_for_thread_context; diff --git a/compiler-rt/lib/msan/msan.cpp b/compiler-rt/lib/msan/msan.cpp index b04a72595b93d6..2ee05f43ec5e56 100644 --- a/compiler-rt/lib/msan/msan.cpp +++ b/compiler-rt/lib/msan/msan.cpp @@ -56,12 +56,11 @@ THREADLOCAL u64 __msan_retval_tls[kMsanRetvalTlsSize / sizeof(u64)]; SANITIZER_INTERFACE_ATTRIBUTE THREADLOCAL u32 __msan_retval_origin_tls; -SANITIZER_INTERFACE_ATTRIBUTE -ALIGNED(16) THREADLOCAL u64 __msan_va_arg_tls[kMsanParamTlsSize / sizeof(u64)]; +alignas(16) SANITIZER_INTERFACE_ATTRIBUTE THREADLOCAL u64 + __msan_va_arg_tls[kMsanParamTlsSize / sizeof(u64)]; -SANITIZER_INTERFACE_ATTRIBUTE -ALIGNED(16) -THREADLOCAL u32 __msan_va_arg_origin_tls[kMsanParamTlsSize / sizeof(u32)]; +alignas(16) SANITIZER_INTERFACE_ATTRIBUTE THREADLOCAL u32 + __msan_va_arg_origin_tls[kMsanParamTlsSize / sizeof(u32)]; SANITIZER_INTERFACE_ATTRIBUTE THREADLOCAL u64 __msan_va_arg_overflow_size_tls; diff --git a/compiler-rt/lib/msan/msan_allocator.h b/compiler-rt/lib/msan/msan_allocator.h index c2a38a401f3b6b..109e24dc509a36 100644 --- a/compiler-rt/lib/msan/msan_allocator.h +++ b/compiler-rt/lib/msan/msan_allocator.h @@ -19,7 +19,7 @@ namespace __msan { struct MsanThreadLocalMallocStorage { // Allocator cache contains atomic_uint64_t which must be 8-byte aligned. - ALIGNED(8) uptr allocator_cache[96 * (512 * 8 + 16)]; // Opaque. + alignas(8) uptr allocator_cache[96 * (512 * 8 + 16)]; // Opaque. void Init(); void CommitBack(); diff --git a/compiler-rt/lib/msan/msan_interceptors.cpp b/compiler-rt/lib/msan/msan_interceptors.cpp index 789b739b41189a..c540523e0eaed9 100644 --- a/compiler-rt/lib/msan/msan_interceptors.cpp +++ b/compiler-rt/lib/msan/msan_interceptors.cpp @@ -1255,7 +1255,7 @@ struct InterceptorContext { } }; -static ALIGNED(64) char interceptor_placeholder[sizeof(InterceptorContext)]; +alignas(64) static char interceptor_placeholder[sizeof(InterceptorContext)]; InterceptorContext *interceptor_ctx() { return reinterpret_cast(&interceptor_placeholder[0]); } diff --git a/compiler-rt/lib/nsan/CMakeLists.txt b/compiler-rt/lib/nsan/CMakeLists.txt index 1e138d4560c896..acadb09c3332bf 100644 --- a/compiler-rt/lib/nsan/CMakeLists.txt +++ b/compiler-rt/lib/nsan/CMakeLists.txt @@ -23,11 +23,11 @@ set(NSAN_HEADERS nsan_suppressions.h ) -append_list_if(COMPILER_RT_HAS_FPIC_FLAG -fPIC NSAN_CFLAGS) - set(NSAN_DYNAMIC_LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS}) set(NSAN_CFLAGS ${SANITIZER_COMMON_CFLAGS}) +append_rtti_flag(OFF NSAN_CFLAGS) + set(NSAN_DYNAMIC_CFLAGS ${NSAN_CFLAGS}) set(NSAN_COMMON_RUNTIME_OBJECT_LIBS diff --git a/compiler-rt/lib/nsan/nsan.cpp b/compiler-rt/lib/nsan/nsan.cpp index 7a5f013579dfba..194093c9679d03 100644 --- a/compiler-rt/lib/nsan/nsan.cpp +++ b/compiler-rt/lib/nsan/nsan.cpp @@ -513,6 +513,8 @@ int32_t checkFT(const FT value, ShadowFT Shadow, CheckTypeT CheckType, } using ValuePrinter = FTPrinter; using ShadowPrinter = FTPrinter; + Printf("%s", D.Default()); + Printf("\n" "%-12s precision (native): dec: %s hex: %s\n" "%-12s precision (shadow): dec: %s hex: %s\n" @@ -535,7 +537,10 @@ int32_t checkFT(const FT value, ShadowFT Shadow, CheckTypeT CheckType, } if (flags().halt_on_error) { - Printf("Exiting\n"); + if (common_flags()->abort_on_error) + Printf("ABORTING\n"); + else + Printf("Exiting\n"); Die(); } return flags().resume_after_warning ? kResumeFromValue : kContinueWithShadow; @@ -638,8 +643,9 @@ void fCmpFailFT(const FT Lhs, const FT Rhs, ShadowFT LhsShadow, const char *const PredicateName = GetPredicateName(Predicate); Printf("%s", D.Warning()); Printf("WARNING: NumericalStabilitySanitizer: floating-point comparison " - "results depend on precision\n" - "%-12s precision dec (native): %s %s %s (%s)\n" + "results depend on precision\n"); + Printf("%s", D.Default()); + Printf("%-12s precision dec (native): %s %s %s (%s)\n" "%-12s precision dec (shadow): %s %s %s (%s)\n" "%-12s precision hex (native): %s %s %s (%s)\n" "%-12s precision hex (shadow): %s %s %s (%s)\n" @@ -658,7 +664,6 @@ void fCmpFailFT(const FT Lhs, const FT Rhs, ShadowFT LhsShadow, FTInfo::kCppTypeName, ShadowPrinter::hex(LhsShadow).Buffer, PredicateName, ShadowPrinter::hex(RhsShadow).Buffer, GetTruthValueName(ShadowResult), D.End()); - Printf("%s", D.Default()); stack.Print(); if (flags().halt_on_error) { Printf("Exiting\n"); @@ -789,6 +794,8 @@ extern "C" SANITIZER_INTERFACE_ATTRIBUTE void __nsan_init() { InitializeSuppressions(); InitializePlatformEarly(); + DisableCoreDumperIfNecessary(); + if (!MmapFixedNoReserve(TypesAddr(), UnusedAddr() - TypesAddr())) Die(); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_allocator.cpp index 0513ae36fbc721..1d5058c81acbcd 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator.cpp @@ -25,7 +25,7 @@ namespace __sanitizer { const char *PrimaryAllocatorName = "SizeClassAllocator"; const char *SecondaryAllocatorName = "LargeMmapAllocator"; -static ALIGNED(64) char internal_alloc_placeholder[sizeof(InternalAllocator)]; +alignas(64) static char internal_alloc_placeholder[sizeof(InternalAllocator)]; static atomic_uint8_t internal_allocator_initialized; static StaticSpinMutex internal_alloc_init_mu; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary32.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary32.h index 52fe3fe3d15bdc..602b197c42ae34 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary32.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary32.h @@ -278,7 +278,7 @@ class SizeClassAllocator32 { static const uptr kRegionSize = 1 << kRegionSizeLog; static const uptr kNumPossibleRegions = kSpaceSize / kRegionSize; - struct ALIGNED(SANITIZER_CACHE_LINE_SIZE) SizeClassInfo { + struct alignas(SANITIZER_CACHE_LINE_SIZE) SizeClassInfo { StaticSpinMutex mutex; IntrusiveList free_list; u32 rand_state; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h index 6e73065d7f53c5..16cdc4ce53b35f 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h @@ -667,7 +667,7 @@ class SizeClassAllocator64 { u64 last_released_bytes; }; - struct ALIGNED(SANITIZER_CACHE_LINE_SIZE) RegionInfo { + struct alignas(SANITIZER_CACHE_LINE_SIZE) RegionInfo { Mutex mutex; uptr num_freed_chunks; // Number of elements in the freearray. uptr mapped_free_array; // Bytes mapped for freearray. diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_atomic.h b/compiler-rt/lib/sanitizer_common/sanitizer_atomic.h index 0609a11ffdebb0..257c457351dbcd 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_atomic.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_atomic.h @@ -61,7 +61,7 @@ struct atomic_uint32_t { struct atomic_uint64_t { typedef u64 Type; // On 32-bit platforms u64 is not necessary aligned on 8 bytes. - volatile ALIGNED(8) Type val_dont_use; + alignas(8) volatile Type val_dont_use; }; struct atomic_uintptr_t { diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc index 1df61e79f7d84a..a6066a6226e1b3 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -1251,6 +1251,7 @@ INTERCEPTOR(int, prctl, int option, unsigned long arg2, unsigned long arg3, void *ctx; COMMON_INTERCEPTOR_ENTER(ctx, prctl, option, arg2, arg3, arg4, arg5); static const int PR_SET_NAME = 15; + static const int PR_GET_NAME = 16; static const int PR_SET_VMA = 0x53564d41; static const int PR_SCHED_CORE = 62; static const int PR_SCHED_CORE_GET = 0; @@ -1264,7 +1265,11 @@ INTERCEPTOR(int, prctl, int option, unsigned long arg2, unsigned long arg3, internal_strncpy(buff, (char *)arg2, 15); buff[15] = 0; COMMON_INTERCEPTOR_SET_THREAD_NAME(ctx, buff); - } else if (res != -1 && option == PR_SCHED_CORE && arg2 == PR_SCHED_CORE_GET) { + } else if (res == 0 && option == PR_GET_NAME) { + char *name = (char *)arg2; + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, name, internal_strlen(name) + 1); + } else if (res != -1 && option == PR_SCHED_CORE && + arg2 == PR_SCHED_CORE_GET) { COMMON_INTERCEPTOR_WRITE_RANGE(ctx, (u64*)(arg5), sizeof(u64)); } return res; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp index f7f5698a5f32d5..7935c88204a054 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp @@ -574,7 +574,9 @@ int TgKill(pid_t pid, tid_t tid, int sig) { return internal_syscall(SYSCALL(thr_kill2), pid, tid, sig); # elif SANITIZER_SOLARIS (void)pid; - return thr_kill(tid, sig); + errno = thr_kill(tid, sig); + // TgKill is expected to return -1 on error, not an errno. + return errno != 0 ? -1 : 0; # endif } # endif diff --git a/compiler-rt/lib/tsan/rtl/tsan_defs.h b/compiler-rt/lib/tsan/rtl/tsan_defs.h index 1ffa3d6aec40bd..270d441dc90b7b 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_defs.h +++ b/compiler-rt/lib/tsan/rtl/tsan_defs.h @@ -30,7 +30,7 @@ # define __MM_MALLOC_H # include # include -# define VECTOR_ALIGNED ALIGNED(16) +# define VECTOR_ALIGNED alignas(16) typedef __m128i m128; #else # define VECTOR_ALIGNED diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp index 034ae3d322b56b..9cab2a37271288 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp @@ -208,7 +208,7 @@ struct AtExitCtx { struct InterceptorContext { // The object is 64-byte aligned, because we want hot data to be located // in a single cache line if possible (it's accessed in every interceptor). - ALIGNED(64) LibIgnore libignore; + alignas(64) LibIgnore libignore; __sanitizer_sigaction sigactions[kSigCount]; #if !SANITIZER_APPLE && !SANITIZER_NETBSD unsigned finalize_key; @@ -220,7 +220,7 @@ struct InterceptorContext { InterceptorContext() : libignore(LINKER_INITIALIZED), atexit_mu(MutexTypeAtExit), AtExitStack() {} }; -static ALIGNED(64) char interceptor_placeholder[sizeof(InterceptorContext)]; +alignas(64) static char interceptor_placeholder[sizeof(InterceptorContext)]; InterceptorContext *interceptor_ctx() { return reinterpret_cast(&interceptor_placeholder[0]); } diff --git a/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp b/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp index 5154662034c56d..befd6a369026d8 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp @@ -76,7 +76,7 @@ struct DynamicAnnContext { }; static DynamicAnnContext *dyn_ann_ctx; -static char dyn_ann_ctx_placeholder[sizeof(DynamicAnnContext)] ALIGNED(64); +alignas(64) static char dyn_ann_ctx_placeholder[sizeof(DynamicAnnContext)]; static void AddExpectRace(ExpectRace *list, char *f, int l, uptr addr, uptr size, char *desc) { diff --git a/compiler-rt/lib/tsan/rtl/tsan_mman.cpp b/compiler-rt/lib/tsan/rtl/tsan_mman.cpp index e129e9af272f5f..0705365d77427d 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_mman.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_mman.cpp @@ -54,7 +54,7 @@ struct MapUnmapCallback { } }; -static char allocator_placeholder[sizeof(Allocator)] ALIGNED(64); +alignas(64) static char allocator_placeholder[sizeof(Allocator)]; Allocator *allocator() { return reinterpret_cast(&allocator_placeholder); } @@ -75,7 +75,7 @@ struct GlobalProc { internal_alloc_mtx(MutexTypeInternalAlloc) {} }; -static char global_proc_placeholder[sizeof(GlobalProc)] ALIGNED(64); +alignas(64) static char global_proc_placeholder[sizeof(GlobalProc)]; GlobalProc *global_proc() { return reinterpret_cast(&global_proc_placeholder); } diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp b/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp index 07d83e1a9a9fff..c8a66e60a69f16 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp @@ -46,7 +46,7 @@ namespace __tsan { #if !SANITIZER_GO -static char main_thread_state[sizeof(ThreadState)] ALIGNED( +static char main_thread_state[sizeof(ThreadState)] alignas( SANITIZER_CACHE_LINE_SIZE); static ThreadState *dead_thread_state; static pthread_key_t thread_state_key; diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp index e5ebb65754b327..bf29aa316f6809 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp @@ -48,11 +48,10 @@ int (*on_finalize)(int); #endif #if !SANITIZER_GO && !SANITIZER_APPLE -__attribute__((tls_model("initial-exec"))) -THREADLOCAL char cur_thread_placeholder[sizeof(ThreadState)] ALIGNED( - SANITIZER_CACHE_LINE_SIZE); +alignas(SANITIZER_CACHE_LINE_SIZE) THREADLOCAL __attribute__((tls_model( + "initial-exec"))) char cur_thread_placeholder[sizeof(ThreadState)]; #endif -static char ctx_placeholder[sizeof(Context)] ALIGNED(SANITIZER_CACHE_LINE_SIZE); +alignas(SANITIZER_CACHE_LINE_SIZE) static char ctx_placeholder[sizeof(Context)]; Context *ctx; // Can be overriden by a front-end. diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.h b/compiler-rt/lib/tsan/rtl/tsan_rtl.h index de4ea0bb5f4877..f48be8e0a4fe08 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_rtl.h +++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.h @@ -136,7 +136,7 @@ struct TidEpoch { Epoch epoch; }; -struct TidSlot { +struct alignas(SANITIZER_CACHE_LINE_SIZE) TidSlot { Mutex mtx; Sid sid; atomic_uint32_t raw_epoch; @@ -153,10 +153,10 @@ struct TidSlot { } TidSlot(); -} ALIGNED(SANITIZER_CACHE_LINE_SIZE); +}; // This struct is stored in TLS. -struct ThreadState { +struct alignas(SANITIZER_CACHE_LINE_SIZE) ThreadState { FastState fast_state; int ignore_sync; #if !SANITIZER_GO @@ -234,7 +234,7 @@ struct ThreadState { const ReportDesc *current_report; explicit ThreadState(Tid tid); -} ALIGNED(SANITIZER_CACHE_LINE_SIZE); +}; #if !SANITIZER_GO #if SANITIZER_APPLE || SANITIZER_ANDROID diff --git a/compiler-rt/lib/tsan/rtl/tsan_suppressions.cpp b/compiler-rt/lib/tsan/rtl/tsan_suppressions.cpp index 70642124990d7b..0559df06e7e2ec 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_suppressions.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_suppressions.cpp @@ -42,7 +42,7 @@ const char *__tsan_default_suppressions() { namespace __tsan { -ALIGNED(64) static char suppression_placeholder[sizeof(SuppressionContext)]; +alignas(64) static char suppression_placeholder[sizeof(SuppressionContext)]; static SuppressionContext *suppression_ctx = nullptr; static const char *kSuppressionTypes[] = { kSuppressionRace, kSuppressionRaceTop, kSuppressionMutex, diff --git a/compiler-rt/lib/tsan/rtl/tsan_vector_clock.h b/compiler-rt/lib/tsan/rtl/tsan_vector_clock.h index 63b206302190d1..51d98113d8e78a 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_vector_clock.h +++ b/compiler-rt/lib/tsan/rtl/tsan_vector_clock.h @@ -34,7 +34,7 @@ class VectorClock { VectorClock& operator=(const VectorClock& other); private: - Epoch clk_[kThreadSlotCount] VECTOR_ALIGNED; + VECTOR_ALIGNED Epoch clk_[kThreadSlotCount]; }; ALWAYS_INLINE Epoch VectorClock::Get(Sid sid) const { diff --git a/compiler-rt/lib/ubsan/ubsan_diag.cpp b/compiler-rt/lib/ubsan/ubsan_diag.cpp index 67e884e4916c50..1625dfe89eb11f 100644 --- a/compiler-rt/lib/ubsan/ubsan_diag.cpp +++ b/compiler-rt/lib/ubsan/ubsan_diag.cpp @@ -402,7 +402,7 @@ ScopedReport::~ScopedReport() { Die(); } -ALIGNED(64) static char suppression_placeholder[sizeof(SuppressionContext)]; +alignas(64) static char suppression_placeholder[sizeof(SuppressionContext)]; static SuppressionContext *suppression_ctx = nullptr; static const char kVptrCheck[] = "vptr_check"; static const char *kSuppressionTypes[] = { diff --git a/compiler-rt/test/asan/TestCases/Darwin/init_for_dlopen.cpp b/compiler-rt/test/asan/TestCases/Darwin/init_for_dlopen.cpp index 0107f30dd7d9bd..3bf8e99703a08a 100644 --- a/compiler-rt/test/asan/TestCases/Darwin/init_for_dlopen.cpp +++ b/compiler-rt/test/asan/TestCases/Darwin/init_for_dlopen.cpp @@ -1,7 +1,7 @@ // RUN: %clangxx -g -O0 %s -o %t // Check that trying to dlopen() the ASan dylib fails. -// We explictly set `abort_on_error=0` because +// We explicitly set `abort_on_error=0` because // - By default the lit config sets this but we don't want this // test to implicitly depend on this. // - It avoids requiring `--crash` to be passed to `not`. diff --git a/compiler-rt/test/msan/Linux/prctl.cpp b/compiler-rt/test/msan/Linux/prctl.cpp new file mode 100644 index 00000000000000..1af4000de8a0ce --- /dev/null +++ b/compiler-rt/test/msan/Linux/prctl.cpp @@ -0,0 +1,23 @@ +// RUN: %clangxx_msan -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s + +#include +#include + +int main(void) { + prctl(PR_SET_NAME, "tname"); + char name[16]; + prctl(PR_GET_NAME, name); + + if (name[0] == 'A') { + return 0; + } + if (name[5] != '\0') { + return 0; + } + if (name[6] != '\0') { + return 0; + } + // CHECK: SUMMARY: MemorySanitizer: use-of-uninitialized-value {{.*prctl.cpp}}:[[@LINE-3]] + + return 0; +} diff --git a/compiler-rt/test/profile/instrprof-gc-sections.c b/compiler-rt/test/profile/instrprof-gc-sections.c index 8b84c0a2421804..6541ee7387e30a 100644 --- a/compiler-rt/test/profile/instrprof-gc-sections.c +++ b/compiler-rt/test/profile/instrprof-gc-sections.c @@ -2,6 +2,7 @@ // FIXME: Investigate and fix. // XFAIL: powerpc64-target-arch +// UNSUPPORTED: powerpc64le-target-arch // RUN: rm -rf %t.profraw // RUN: %clang_profgen=%t.profraw -fuse-ld=lld -fcoverage-mapping -mllvm -enable-name-compression=false -DCODE=1 -ffunction-sections -fdata-sections -Wl,--gc-sections -o %t %s diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp index 581739500e7a9e..d5d81280e0b44c 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -60,5 +61,14 @@ int main() { } munmap(p, 128); + res = prctl(PR_SET_NAME, "tname"); + if (res == 0) { + char name[16]; + res = prctl(PR_GET_NAME, name); + if (res == 0) { + assert(!strcmp(name, "tname")); + } + } + return 0; } diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index 1fdf22daf3688f..87716731ead855 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -1001,3 +1001,62 @@ PROGRAM example_getcwd PRINT *, status END PROGRAM ``` + +### Non-standard Intrinsics: RENAME +`RENAME(OLD, NEW[, STATUS])` renames/moves a file on the filesystem. + +This intrinsic is provided in both subroutine and function form; however, only one form can be used in any given program unit. + +#### Usage and Info + +- **Standard:** GNU extension +- **Class:** Subroutine, function +- **Syntax:** `CALL RENAME(SRC, DST[, STATUS])` +- **Arguments:** +- **Return value** status code (0: success, non-zero for errors) + +| Argument | Description | +|----------|-----------------------------------| +| `SRC` | Source path | +| `DST` | Destination path | +| `STATUS` | Status code (for subroutine form) | + +The status code returned by both the subroutine and function form corresponds to the value of `errno` if the invocation of `rename(2)` was not successful. + +#### Example + +Function form: +``` +program rename_func + implicit none + integer :: status + status = rename('src', 'dst') + print *, 'status:', status + status = rename('dst', 'src') + print *, 'status:', status +end program rename_func +``` + +Subroutine form: +``` +program rename_proc + implicit none + integer :: status + call rename('src', 'dst', status) + print *, 'status:', status + call rename('dst', 'src') +end program rename_proc +``` + +### Non-standard Intrinsics: SECOND +This intrinsic is an alias for `CPU_TIME`: supporting both a subroutine and a +function form. + +#### Usage and Info + +- **Standard:** GNU extension +- **Class:** Subroutine, function +- **Syntax:** `CALL SECOND(TIME)` or `TIME = SECOND()` +- **Arguments:** `TIME` - a REAL value into which the elapsed CPU time in + seconds is written +- **RETURN value:** same as TIME argument diff --git a/flang/include/flang/Frontend/TargetOptions.h b/flang/include/flang/Frontend/TargetOptions.h index ef5d270a2185da..fa72c77a028a1c 100644 --- a/flang/include/flang/Frontend/TargetOptions.h +++ b/flang/include/flang/Frontend/TargetOptions.h @@ -32,6 +32,9 @@ class TargetOptions { /// If given, the name of the target CPU to generate code for. std::string cpu; + /// If given, the name of the target CPU to tune code for. + std::string cpuToTuneFor; + /// The list of target specific features to enable or disable, as written on /// the command line. std::vector featuresAsWritten; diff --git a/flang/include/flang/Lower/Bridge.h b/flang/include/flang/Lower/Bridge.h index 52110b861b6801..4379ed512cdf0a 100644 --- a/flang/include/flang/Lower/Bridge.h +++ b/flang/include/flang/Lower/Bridge.h @@ -65,11 +65,11 @@ class LoweringBridge { const Fortran::lower::LoweringOptions &loweringOptions, const std::vector &envDefaults, const Fortran::common::LanguageFeatureControl &languageFeatures, - const llvm::TargetMachine &targetMachine) { + const llvm::TargetMachine &targetMachine, llvm::StringRef tuneCPU) { return LoweringBridge(ctx, semanticsContext, defaultKinds, intrinsics, targetCharacteristics, allCooked, triple, kindMap, loweringOptions, envDefaults, languageFeatures, - targetMachine); + targetMachine, tuneCPU); } //===--------------------------------------------------------------------===// @@ -148,7 +148,7 @@ class LoweringBridge { const Fortran::lower::LoweringOptions &loweringOptions, const std::vector &envDefaults, const Fortran::common::LanguageFeatureControl &languageFeatures, - const llvm::TargetMachine &targetMachine); + const llvm::TargetMachine &targetMachine, const llvm::StringRef tuneCPU); LoweringBridge() = delete; LoweringBridge(const LoweringBridge &) = delete; diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 53168a920e3c6b..80f077ad133f38 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -347,6 +347,8 @@ struct IntrinsicLibrary { fir::ExtendedValue genReduce(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genReduceDim(mlir::Type, llvm::ArrayRef); + fir::ExtendedValue genRename(std::optional, + mlir::ArrayRef); fir::ExtendedValue genRepeat(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genReshape(mlir::Type, llvm::ArrayRef); mlir::Value genRRSpacing(mlir::Type resultType, @@ -355,6 +357,8 @@ struct IntrinsicLibrary { llvm::ArrayRef); mlir::Value genScale(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genScan(mlir::Type, llvm::ArrayRef); + fir::ExtendedValue genSecond(std::optional, + mlir::ArrayRef); fir::ExtendedValue genSelectedCharKind(mlir::Type, llvm::ArrayRef); mlir::Value genSelectedIntKind(mlir::Type, llvm::ArrayRef); diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h b/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h index 7497a4bc35646f..240de5a899d37b 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h @@ -53,6 +53,10 @@ void genRandomNumber(fir::FirOpBuilder &, mlir::Location, mlir::Value harvest); void genRandomSeed(fir::FirOpBuilder &, mlir::Location, mlir::Value size, mlir::Value put, mlir::Value get); +/// generate rename runtime call +void genRename(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value path1, mlir::Value path2, mlir::Value status); + /// generate runtime call to transfer intrinsic with no size argument void genTransfer(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value resultBox, mlir::Value sourceBox, diff --git a/flang/include/flang/Optimizer/CodeGen/CGPasses.td b/flang/include/flang/Optimizer/CodeGen/CGPasses.td index 9a4d327b33bad6..989e3943882a19 100644 --- a/flang/include/flang/Optimizer/CodeGen/CGPasses.td +++ b/flang/include/flang/Optimizer/CodeGen/CGPasses.td @@ -31,6 +31,8 @@ def FIRToLLVMLowering : Pass<"fir-to-llvm-ir", "mlir::ModuleOp"> { "Override module's data layout.">, Option<"forcedTargetCPU", "target-cpu", "std::string", /*default=*/"", "Override module's target CPU.">, + Option<"forcedTuneCPU", "tune-cpu", "std::string", /*default=*/"", + "Override module's tune CPU.">, Option<"forcedTargetFeatures", "target-features", "std::string", /*default=*/"", "Override module's target features.">, Option<"applyTBAA", "apply-tbaa", "bool", /*default=*/"false", @@ -68,6 +70,8 @@ def TargetRewritePass : Pass<"target-rewrite", "mlir::ModuleOp"> { "Override module's target triple.">, Option<"forcedTargetCPU", "target-cpu", "std::string", /*default=*/"", "Override module's target CPU.">, + Option<"forcedTuneCPU", "tune-cpu", "std::string", /*default=*/"", + "Override module's tune CPU.">, Option<"forcedTargetFeatures", "target-features", "std::string", /*default=*/"", "Override module's target features.">, Option<"noCharacterConversion", "no-character-conversion", diff --git a/flang/include/flang/Optimizer/CodeGen/Target.h b/flang/include/flang/Optimizer/CodeGen/Target.h index 3cf6a74a9adb7a..a7161152a5c323 100644 --- a/flang/include/flang/Optimizer/CodeGen/Target.h +++ b/flang/include/flang/Optimizer/CodeGen/Target.h @@ -76,6 +76,11 @@ class CodeGenSpecifics { llvm::StringRef targetCPU, mlir::LLVM::TargetFeaturesAttr targetFeatures, const mlir::DataLayout &dl); + static std::unique_ptr + get(mlir::MLIRContext *ctx, llvm::Triple &&trp, KindMapping &&kindMap, + llvm::StringRef targetCPU, mlir::LLVM::TargetFeaturesAttr targetFeatures, + const mlir::DataLayout &dl, llvm::StringRef tuneCPU); + static TypeAndAttr getTypeAndAttr(mlir::Type t) { return TypeAndAttr{t, {}}; } CodeGenSpecifics(mlir::MLIRContext *ctx, llvm::Triple &&trp, @@ -83,7 +88,17 @@ class CodeGenSpecifics { mlir::LLVM::TargetFeaturesAttr targetFeatures, const mlir::DataLayout &dl) : context{*ctx}, triple{std::move(trp)}, kindMap{std::move(kindMap)}, - targetCPU{targetCPU}, targetFeatures{targetFeatures}, dataLayout{&dl} {} + targetCPU{targetCPU}, targetFeatures{targetFeatures}, dataLayout{&dl}, + tuneCPU{""} {} + + CodeGenSpecifics(mlir::MLIRContext *ctx, llvm::Triple &&trp, + KindMapping &&kindMap, llvm::StringRef targetCPU, + mlir::LLVM::TargetFeaturesAttr targetFeatures, + const mlir::DataLayout &dl, llvm::StringRef tuneCPU) + : context{*ctx}, triple{std::move(trp)}, kindMap{std::move(kindMap)}, + targetCPU{targetCPU}, targetFeatures{targetFeatures}, dataLayout{&dl}, + tuneCPU{tuneCPU} {} + CodeGenSpecifics() = delete; virtual ~CodeGenSpecifics() {} @@ -165,6 +180,7 @@ class CodeGenSpecifics { virtual unsigned char getCIntTypeWidth() const = 0; llvm::StringRef getTargetCPU() const { return targetCPU; } + llvm::StringRef getTuneCPU() const { return tuneCPU; } mlir::LLVM::TargetFeaturesAttr getTargetFeatures() const { return targetFeatures; @@ -182,6 +198,7 @@ class CodeGenSpecifics { llvm::StringRef targetCPU; mlir::LLVM::TargetFeaturesAttr targetFeatures; const mlir::DataLayout *dataLayout = nullptr; + llvm::StringRef tuneCPU; }; } // namespace fir diff --git a/flang/include/flang/Optimizer/Dialect/Support/FIRContext.h b/flang/include/flang/Optimizer/Dialect/Support/FIRContext.h index 059a10ce2fe511..bd31aa0782493c 100644 --- a/flang/include/flang/Optimizer/Dialect/Support/FIRContext.h +++ b/flang/include/flang/Optimizer/Dialect/Support/FIRContext.h @@ -58,6 +58,13 @@ void setTargetCPU(mlir::ModuleOp mod, llvm::StringRef cpu); /// Get the target CPU string from the Module or return a null reference. llvm::StringRef getTargetCPU(mlir::ModuleOp mod); +/// Set the tune CPU for the module. `cpu` must not be deallocated while +/// module `mod` is still live. +void setTuneCPU(mlir::ModuleOp mod, llvm::StringRef cpu); + +/// Get the tune CPU string from the Module or return a null reference. +llvm::StringRef getTuneCPU(mlir::ModuleOp mod); + /// Set the target features for the module. void setTargetFeatures(mlir::ModuleOp mod, llvm::StringRef features); diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index b3ed9acad36df4..786083f95e15c0 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -411,7 +411,10 @@ def FunctionAttr : Pass<"function-attr", "mlir::func::FuncOp"> { Option<"unsafeFPMath", "unsafe-fp-math", "bool", /*default=*/"false", "Set the unsafe-fp-math attribute on functions in the module.">, - ]; + Option<"tuneCPU", "tune-cpu", + "llvm::StringRef", /*default=*/"llvm::StringRef{}", + "Set the tune-cpu attribute on functions in the module.">, +]; } def AssumedRankOpConversion : Pass<"fir-assumed-rank-op", "mlir::ModuleOp"> { diff --git a/flang/include/flang/Runtime/misc-intrinsic.h b/flang/include/flang/Runtime/misc-intrinsic.h index 73cc9e2023d979..3fb3aaed49c0fb 100644 --- a/flang/include/flang/Runtime/misc-intrinsic.h +++ b/flang/include/flang/Runtime/misc-intrinsic.h @@ -19,6 +19,8 @@ namespace Fortran::runtime { class Descriptor; extern "C" { +void RTDECL(Rename)(const Descriptor &path1, const Descriptor &path2, + const Descriptor *status, const char *sourceFile, int line); void RTDECL(Transfer)(Descriptor &result, const Descriptor &source, const Descriptor &mold, const char *sourceFile, int line); void RTDECL(TransferSize)(Descriptor &result, const Descriptor &source, diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc index 7f2910c5cfd3c3..7df50449494631 100644 --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -9,6 +9,7 @@ /// This file defines some shared command-line options that can be used when /// debugging the test tools. This file must be included into the tool. +#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h" #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" #include "mlir/Dialect/LLVMIR/LLVMAttrs.h" #include "mlir/Pass/PassManager.h" @@ -223,6 +224,10 @@ inline void addFIRToLLVMPass( options.forceUnifiedTBAATree = useOldAliasTags; addPassConditionally(pm, disableFirToLlvmIr, [&]() { return fir::createFIRToLLVMPass(options); }); + // The dialect conversion framework may leave dead unrealized_conversion_cast + // ops behind, so run reconcile-unrealized-casts to clean them up. + addPassConditionally(pm, disableFirToLlvmIr, + [&]() { return mlir::createReconcileUnrealizedCastsPass(); }); } inline void addLLVMDialectToLLVMPass( diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index de6e79783a3130..039dbcb82f7452 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -795,6 +795,10 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"identity", SameType, Rank::scalar, Optionality::optional}, {"ordered", AnyLogical, Rank::scalar, Optionality::optional}}, SameType, Rank::scalar, IntrinsicClass::transformationalFunction}, + {"rename", + {{"path1", DefaultChar, Rank::scalar}, + {"path2", DefaultChar, Rank::scalar}}, + DefaultInt, Rank::scalar}, {"repeat", {{"string", SameCharNoLen, Rank::scalar}, {"ncopies", AnyInt, Rank::scalar}}, @@ -818,6 +822,7 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"back", AnyLogical, Rank::elemental, Optionality::optional}, DefaultingKIND}, KINDInt}, + {"second", {}, DefaultReal, Rank::scalar}, {"selected_char_kind", {{"name", DefaultChar, Rank::scalar}}, DefaultInt, Rank::scalar, IntrinsicClass::transformationalFunction}, {"selected_int_kind", {{"r", AnyInt, Rank::scalar}}, DefaultInt, @@ -1464,6 +1469,14 @@ static const IntrinsicInterface intrinsicSubroutine[]{ {"get", DefaultInt, Rank::vector, Optionality::optional, common::Intent::Out}}, {}, Rank::elemental, IntrinsicClass::impureSubroutine}, + {"rename", + {{"path1", DefaultChar, Rank::scalar}, + {"path2", DefaultChar, Rank::scalar}, + {"status", DefaultInt, Rank::scalar, Optionality::optional, + common::Intent::Out}}, + {}, Rank::scalar, IntrinsicClass::impureSubroutine}, + {"second", {{"time", DefaultReal, Rank::scalar}}, {}, Rank::scalar, + IntrinsicClass::impureSubroutine}, {"system", {{"command", DefaultChar, Rank::scalar}, {"exitstat", DefaultInt, Rank::scalar, Optionality::optional, @@ -2612,7 +2625,8 @@ bool IntrinsicProcTable::Implementation::IsDualIntrinsic( const std::string &name) const { // Collection for some intrinsics with function and subroutine form, // in order to pass the semantic check. - static const std::string dualIntrinsic[]{{"etime"}, {"getcwd"}}; + static const std::string dualIntrinsic[]{ + {"etime"s}, {"getcwd"s}, {"rename"s}, {"second"s}}; return std::find_if(std::begin(dualIntrinsic), std::end(dualIntrinsic), [&name](const std::string &dualName) { diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 37b50702c7063b..8c892d9d032e1d 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -431,6 +431,10 @@ static void parseTargetArgs(TargetOptions &opts, llvm::opt::ArgList &args) { args.getLastArg(clang::driver::options::OPT_target_cpu)) opts.cpu = a->getValue(); + if (const llvm::opt::Arg *a = + args.getLastArg(clang::driver::options::OPT_tune_cpu)) + opts.cpuToTuneFor = a->getValue(); + for (const llvm::opt::Arg *currentArg : args.filtered(clang::driver::options::OPT_target_feature)) opts.featuresAsWritten.emplace_back(currentArg->getValue()); diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index a85ecd1ac71b3e..5c86bd947ce73f 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -297,7 +297,8 @@ bool CodeGenAction::beginSourceFileAction() { ci.getParsing().allCooked(), ci.getInvocation().getTargetOpts().triple, kindMap, ci.getInvocation().getLoweringOpts(), ci.getInvocation().getFrontendOpts().envDefaults, - ci.getInvocation().getFrontendOpts().features, targetMachine); + ci.getInvocation().getFrontendOpts().features, targetMachine, + ci.getInvocation().getTargetOpts().cpuToTuneFor); // Fetch module from lb, so we can set mlirModule = std::make_unique(lb.getModule()); diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 267a3557ab8c4a..77e038dac13ff6 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -6025,7 +6025,7 @@ Fortran::lower::LoweringBridge::LoweringBridge( const Fortran::lower::LoweringOptions &loweringOptions, const std::vector &envDefaults, const Fortran::common::LanguageFeatureControl &languageFeatures, - const llvm::TargetMachine &targetMachine) + const llvm::TargetMachine &targetMachine, const llvm::StringRef tuneCPU) : semanticsContext{semanticsContext}, defaultKinds{defaultKinds}, intrinsics{intrinsics}, targetCharacteristics{targetCharacteristics}, cooked{&cooked}, context{context}, kindMap{kindMap}, @@ -6082,6 +6082,7 @@ Fortran::lower::LoweringBridge::LoweringBridge( fir::setTargetTriple(*module.get(), triple); fir::setKindMapping(*module.get(), kindMap); fir::setTargetCPU(*module.get(), targetMachine.getTargetCPU()); + fir::setTuneCPU(*module.get(), tuneCPU); fir::setTargetFeatures(*module.get(), targetMachine.getTargetFeatureString()); fir::support::setMLIRDataLayout(*module.get(), targetMachine.createDataLayout()); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index fe7605d8ce4ba7..e12e21bb00e15c 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -550,6 +550,12 @@ static constexpr IntrinsicHandler handlers[]{ {"identity", asAddr, handleDynamicOptional}, {"ordered", asValue, handleDynamicOptional}}}, /*isElemental=*/false}, + {"rename", + &I::genRename, + {{{"path1", asBox}, + {"path2", asBox}, + {"status", asBox, handleDynamicOptional}}}, + /*isElemental=*/false}, {"repeat", &I::genRepeat, {{{"string", asAddr}, {"ncopies", asValue}}}, @@ -577,6 +583,10 @@ static constexpr IntrinsicHandler handlers[]{ {"back", asValue, handleDynamicOptional}, {"kind", asValue}}}, /*isElemental=*/true}, + {"second", + &I::genSecond, + {{{"time", asAddr}}}, + /*isElemental=*/false}, {"selected_char_kind", &I::genSelectedCharKind, {{{"name", asAddr}}}, @@ -5917,6 +5927,37 @@ IntrinsicLibrary::genReduce(mlir::Type resultType, return readAndAddCleanUp(resultMutableBox, resultType, "REDUCE"); } +// RENAME +fir::ExtendedValue +IntrinsicLibrary::genRename(std::optional resultType, + mlir::ArrayRef args) { + assert((args.size() == 3 && !resultType.has_value()) || + (args.size() == 2 && resultType.has_value())); + + mlir::Value path1 = fir::getBase(args[0]); + mlir::Value path2 = fir::getBase(args[1]); + if (!path1 || !path2) + fir::emitFatalError(loc, "Expected at least two dummy arguments"); + + if (resultType.has_value()) { + // code-gen for the function form of RENAME + auto statusAddr = builder.createTemporary(loc, *resultType); + auto statusBox = builder.createBox(loc, statusAddr); + fir::runtime::genRename(builder, loc, path1, path2, statusBox); + return builder.create(loc, statusAddr); + } else { + // code-gen for the procedure form of RENAME + mlir::Type boxNoneTy = fir::BoxType::get(builder.getNoneType()); + auto status = args[2]; + mlir::Value statusBox = + isStaticallyPresent(status) + ? fir::getBase(status) + : builder.create(loc, boxNoneTy).getResult(); + fir::runtime::genRename(builder, loc, path1, path2, statusBox); + return {}; + } +} + // REPEAT fir::ExtendedValue IntrinsicLibrary::genRepeat(mlir::Type resultType, @@ -6103,6 +6144,27 @@ IntrinsicLibrary::genScan(mlir::Type resultType, return readAndAddCleanUp(resultMutableBox, resultType, "SCAN"); } +// SECOND +fir::ExtendedValue +IntrinsicLibrary::genSecond(std::optional resultType, + mlir::ArrayRef args) { + assert(args.size() == 1 && !resultType || args.empty() && resultType); + + fir::ExtendedValue result; + + if (resultType) + result = builder.createTemporary(loc, *resultType); + else + result = args[0]; + + llvm::SmallVector subroutineArgs(1, result); + genCpuTime(subroutineArgs); + + if (resultType) + return result; + return {}; +} + // SELECTED_CHAR_KIND fir::ExtendedValue IntrinsicLibrary::genSelectedCharKind(mlir::Type resultType, diff --git a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp index 3f36d639861b12..aff3cadc3c300d 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp @@ -199,6 +199,24 @@ void fir::runtime::genRandomSeed(fir::FirOpBuilder &builder, mlir::Location loc, builder.create(loc, func, args); } +/// generate rename runtime call +void fir::runtime::genRename(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value path1, mlir::Value path2, + mlir::Value status) { + auto runtimeFunc = + fir::runtime::getRuntimeFunc(loc, builder); + mlir::FunctionType runtimeFuncTy = runtimeFunc.getFunctionType(); + + mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); + mlir::Value sourceLine = + fir::factory::locationToLineNo(builder, loc, runtimeFuncTy.getInput(4)); + + llvm::SmallVector args = + fir::runtime::createArguments(builder, loc, runtimeFuncTy, path1, path2, + status, sourceFile, sourceLine); + builder.create(loc, runtimeFunc, args); +} + /// generate runtime call to transfer intrinsic with no size argument void fir::runtime::genTransfer(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value resultBox, mlir::Value sourceBox, diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 7483acfcd1ca7e..f9ea92a843b23d 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -35,7 +35,6 @@ #include "mlir/Conversion/MathToLLVM/MathToLLVM.h" #include "mlir/Conversion/MathToLibm/MathToLibm.h" #include "mlir/Conversion/OpenMPToLLVM/ConvertOpenMPToLLVM.h" -#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h" #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/DLTI/DLTI.h" @@ -2042,13 +2041,13 @@ struct ExtractValueOpConversion /// InsertValue is the generalized instruction for the composition of new /// aggregate type values. struct InsertValueOpConversion - : public fir::FIROpAndTypeConversion, + : public mlir::OpConversionPattern, public ValueOpCommon { - using FIROpAndTypeConversion::FIROpAndTypeConversion; + using OpConversionPattern::OpConversionPattern; llvm::LogicalResult - doRewrite(fir::InsertValueOp insertVal, mlir::Type ty, OpAdaptor adaptor, - mlir::ConversionPatternRewriter &rewriter) const override { + matchAndRewrite(fir::InsertValueOp insertVal, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { mlir::ValueRange operands = adaptor.getOperands(); auto indices = collectIndices(rewriter, insertVal.getCoor()); toRowMajor(indices, operands[0].getType()); @@ -2669,8 +2668,9 @@ struct TypeDescOpConversion : public fir::FIROpConversion { }; /// Lower `fir.has_value` operation to `llvm.return` operation. -struct HasValueOpConversion : public fir::FIROpConversion { - using FIROpConversion::FIROpConversion; +struct HasValueOpConversion + : public mlir::OpConversionPattern { + using OpConversionPattern::OpConversionPattern; llvm::LogicalResult matchAndRewrite(fir::HasValueOp op, OpAdaptor adaptor, @@ -3515,29 +3515,6 @@ struct MustBeDeadConversion : public fir::FIROpConversion { } }; -struct UnrealizedConversionCastOpConversion - : public fir::FIROpConversion { - using FIROpConversion::FIROpConversion; - - llvm::LogicalResult - matchAndRewrite(mlir::UnrealizedConversionCastOp op, OpAdaptor adaptor, - mlir::ConversionPatternRewriter &rewriter) const override { - assert(op.getOutputs().getTypes().size() == 1 && "expect a single type"); - mlir::Type convertedType = convertType(op.getOutputs().getTypes()[0]); - if (convertedType == adaptor.getInputs().getTypes()[0]) { - rewriter.replaceOp(op, adaptor.getInputs()); - return mlir::success(); - } - - convertedType = adaptor.getInputs().getTypes()[0]; - if (convertedType == op.getOutputs().getType()[0]) { - rewriter.replaceOp(op, adaptor.getInputs()); - return mlir::success(); - } - return mlir::failure(); - } -}; - struct ShapeOpConversion : public MustBeDeadConversion { using MustBeDeadConversion::MustBeDeadConversion; }; @@ -3618,6 +3595,9 @@ class FIRToLLVMLowering if (!forcedTargetCPU.empty()) fir::setTargetCPU(mod, forcedTargetCPU); + if (!forcedTuneCPU.empty()) + fir::setTuneCPU(mod, forcedTuneCPU); + if (!forcedTargetFeatures.empty()) fir::setTargetFeatures(mod, forcedTargetFeatures); @@ -3714,7 +3694,8 @@ class FIRToLLVMLowering signalPassFailure(); } - // Run pass to add comdats to functions that have weak linkage on relevant platforms + // Run pass to add comdats to functions that have weak linkage on relevant + // platforms if (fir::getTargetTriple(mod).supportsCOMDAT()) { mlir::OpPassManager comdatPM("builtin.module"); comdatPM.addPass(mlir::LLVM::createLLVMAddComdats()); @@ -3789,16 +3770,19 @@ void fir::populateFIRToLLVMConversionPatterns( DivcOpConversion, EmboxOpConversion, EmboxCharOpConversion, EmboxProcOpConversion, ExtractValueOpConversion, FieldIndexOpConversion, FirEndOpConversion, FreeMemOpConversion, GlobalLenOpConversion, - GlobalOpConversion, HasValueOpConversion, InsertOnRangeOpConversion, - InsertValueOpConversion, IsPresentOpConversion, LenParamIndexOpConversion, - LoadOpConversion, MulcOpConversion, NegcOpConversion, - NoReassocOpConversion, SelectCaseOpConversion, SelectOpConversion, - SelectRankOpConversion, SelectTypeOpConversion, ShapeOpConversion, - ShapeShiftOpConversion, ShiftOpConversion, SliceOpConversion, - StoreOpConversion, StringLitOpConversion, SubcOpConversion, - TypeDescOpConversion, TypeInfoOpConversion, UnboxCharOpConversion, - UnboxProcOpConversion, UndefOpConversion, UnreachableOpConversion, - UnrealizedConversionCastOpConversion, XArrayCoorOpConversion, - XEmboxOpConversion, XReboxOpConversion, ZeroOpConversion>(converter, - options); + GlobalOpConversion, InsertOnRangeOpConversion, IsPresentOpConversion, + LenParamIndexOpConversion, LoadOpConversion, MulcOpConversion, + NegcOpConversion, NoReassocOpConversion, SelectCaseOpConversion, + SelectOpConversion, SelectRankOpConversion, SelectTypeOpConversion, + ShapeOpConversion, ShapeShiftOpConversion, ShiftOpConversion, + SliceOpConversion, StoreOpConversion, StringLitOpConversion, + SubcOpConversion, TypeDescOpConversion, TypeInfoOpConversion, + UnboxCharOpConversion, UnboxProcOpConversion, UndefOpConversion, + UnreachableOpConversion, XArrayCoorOpConversion, XEmboxOpConversion, + XReboxOpConversion, ZeroOpConversion>(converter, options); + + // Patterns that are populated without a type converter do not trigger + // target materializations for the operands of the root op. + patterns.insert( + patterns.getContext()); } diff --git a/flang/lib/Optimizer/CodeGen/Target.cpp b/flang/lib/Optimizer/CodeGen/Target.cpp index 652e2bddc1b896..25141102a8c432 100644 --- a/flang/lib/Optimizer/CodeGen/Target.cpp +++ b/flang/lib/Optimizer/CodeGen/Target.cpp @@ -1113,3 +1113,14 @@ fir::CodeGenSpecifics::get(mlir::MLIRContext *ctx, llvm::Triple &&trp, } TODO(mlir::UnknownLoc::get(ctx), "target not implemented"); } + +std::unique_ptr fir::CodeGenSpecifics::get( + mlir::MLIRContext *ctx, llvm::Triple &&trp, KindMapping &&kindMap, + llvm::StringRef targetCPU, mlir::LLVM::TargetFeaturesAttr targetFeatures, + const mlir::DataLayout &dl, llvm::StringRef tuneCPU) { + std::unique_ptr CGS = fir::CodeGenSpecifics::get( + ctx, std::move(trp), std::move(kindMap), targetCPU, targetFeatures, dl); + + CGS->tuneCPU = tuneCPU; + return CGS; +} diff --git a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp index 561d700f412203..85bf90e4750633 100644 --- a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp +++ b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp @@ -89,6 +89,9 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { if (!forcedTargetCPU.empty()) fir::setTargetCPU(mod, forcedTargetCPU); + if (!forcedTuneCPU.empty()) + fir::setTuneCPU(mod, forcedTuneCPU); + if (!forcedTargetFeatures.empty()) fir::setTargetFeatures(mod, forcedTargetFeatures); @@ -106,7 +109,8 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { auto specifics = fir::CodeGenSpecifics::get( mod.getContext(), fir::getTargetTriple(mod), fir::getKindMapping(mod), - fir::getTargetCPU(mod), fir::getTargetFeatures(mod), *dl); + fir::getTargetCPU(mod), fir::getTargetFeatures(mod), *dl, + fir::getTuneCPU(mod)); setMembers(specifics.get(), &rewriter, &*dl); @@ -672,12 +676,18 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { auto targetCPU = specifics->getTargetCPU(); mlir::StringAttr targetCPUAttr = targetCPU.empty() ? nullptr : mlir::StringAttr::get(ctx, targetCPU); + auto tuneCPU = specifics->getTuneCPU(); + mlir::StringAttr tuneCPUAttr = + tuneCPU.empty() ? nullptr : mlir::StringAttr::get(ctx, tuneCPU); auto targetFeaturesAttr = specifics->getTargetFeatures(); for (auto fn : mod.getOps()) { if (targetCPUAttr) fn->setAttr("target_cpu", targetCPUAttr); + if (tuneCPUAttr) + fn->setAttr("tune_cpu", tuneCPUAttr); + if (targetFeaturesAttr) fn->setAttr("target_features", targetFeaturesAttr); diff --git a/flang/lib/Optimizer/CodeGen/TypeConverter.cpp b/flang/lib/Optimizer/CodeGen/TypeConverter.cpp index ce86c625e082fd..59f67f4fcad44a 100644 --- a/flang/lib/Optimizer/CodeGen/TypeConverter.cpp +++ b/flang/lib/Optimizer/CodeGen/TypeConverter.cpp @@ -35,7 +35,8 @@ LLVMTypeConverter::LLVMTypeConverter(mlir::ModuleOp module, bool applyTBAA, kindMapping(getKindMapping(module)), specifics(CodeGenSpecifics::get( module.getContext(), getTargetTriple(module), getKindMapping(module), - getTargetCPU(module), getTargetFeatures(module), dl)), + getTargetCPU(module), getTargetFeatures(module), dl, + getTuneCPU(module))), tbaaBuilder(std::make_unique(module->getContext(), applyTBAA, forceUnifiedTBAATree)), dataLayout{&dl} { @@ -122,40 +123,6 @@ LLVMTypeConverter::LLVMTypeConverter(mlir::ModuleOp module, bool applyTBAA, // Convert it here to i1 just in case it survives. return mlir::IntegerType::get(&getContext(), 1); }); - // FIXME: https://reviews.llvm.org/D82831 introduced an automatic - // materialization of conversion around function calls that is not working - // well with fir lowering to llvm (incorrect llvm.mlir.cast are inserted). - // Workaround until better analysis: register a handler that does not insert - // any conversions. - addSourceMaterialization( - [&](mlir::OpBuilder &builder, mlir::Type resultType, - mlir::ValueRange inputs, - mlir::Location loc) -> std::optional { - if (inputs.size() != 1) - return std::nullopt; - return inputs[0]; - }); - // Similar FIXME workaround here (needed for compare.fir/select-type.fir - // as well as rebox-global.fir tests). This is needed to cope with the - // the fact that codegen does not lower some operation results to the LLVM - // type produced by this LLVMTypeConverter. For instance, inside FIR - // globals, fir.box are lowered to llvm.struct, while the fir.box type - // conversion translates it into an llvm.ptr> because - // descriptors are manipulated in memory outside of global initializers - // where this is not possible. Hence, MLIR inserts - // builtin.unrealized_conversion_cast after the translation of operations - // producing fir.box in fir.global codegen. addSourceMaterialization and - // addTargetMaterialization allow ignoring these ops and removing them - // after codegen assuming the type discrepencies are intended (like for - // fir.box inside globals). - addTargetMaterialization( - [&](mlir::OpBuilder &builder, mlir::Type resultType, - mlir::ValueRange inputs, - mlir::Location loc) -> std::optional { - if (inputs.size() != 1) - return std::nullopt; - return inputs[0]; - }); } // i32 is used here because LLVM wants i32 constants when indexing into struct diff --git a/flang/lib/Optimizer/Dialect/Support/FIRContext.cpp b/flang/lib/Optimizer/Dialect/Support/FIRContext.cpp index c4d00875c45e47..1aa631cb391269 100644 --- a/flang/lib/Optimizer/Dialect/Support/FIRContext.cpp +++ b/flang/lib/Optimizer/Dialect/Support/FIRContext.cpp @@ -77,6 +77,24 @@ llvm::StringRef fir::getTargetCPU(mlir::ModuleOp mod) { return {}; } +static constexpr const char *tuneCpuName = "fir.tune_cpu"; + +void fir::setTuneCPU(mlir::ModuleOp mod, llvm::StringRef cpu) { + if (cpu.empty()) + return; + + auto *ctx = mod.getContext(); + + mod->setAttr(tuneCpuName, mlir::StringAttr::get(ctx, cpu)); +} + +llvm::StringRef fir::getTuneCPU(mlir::ModuleOp mod) { + if (auto attr = mod->getAttrOfType(tuneCpuName)) + return attr.getValue(); + + return {}; +} + static constexpr const char *targetFeaturesName = "fir.target_features"; void fir::setTargetFeatures(mlir::ModuleOp mod, llvm::StringRef features) { diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 24742826280ce3..e150495d189c4e 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -84,9 +84,9 @@ class OmpWorkshareBlockChecker { parser::CharBlock source_; }; -class OmpCycleAndExitChecker { +class AssociatedLoopChecker { public: - OmpCycleAndExitChecker(SemanticsContext &context, std::int64_t level) + AssociatedLoopChecker(SemanticsContext &context, std::int64_t level) : context_{context}, level_{level} {} template bool Pre(const T &) { return true; } @@ -94,11 +94,24 @@ class OmpCycleAndExitChecker { bool Pre(const parser::DoConstruct &dc) { level_--; - const auto &constructName{std::get<0>(std::get<0>(dc.t).statement.t)}; + const auto &doStmt{ + std::get>(dc.t)}; + const auto &constructName{ + std::get>(doStmt.statement.t)}; if (constructName) { constructNamesAndLevels_.emplace( constructName.value().ToString(), level_); } + if (level_ >= 0) { + if (dc.IsDoWhile()) { + context_.Say(doStmt.source, + "The associated loop of a loop-associated directive cannot be a DO WHILE."_err_en_US); + } + if (!dc.GetLoopControl()) { + context_.Say(doStmt.source, + "The associated loop of a loop-associated directive cannot be a DO without control."_err_en_US); + } + } return true; } @@ -450,9 +463,8 @@ void OmpStructureChecker::Enter(const parser::OpenMPLoopConstruct &x) { const auto &doBlock{std::get(doConstruct->t)}; CheckNoBranching(doBlock, beginDir.v, beginDir.source); } - CheckDoWhile(x); CheckLoopItrVariableIsInt(x); - CheckCycleConstraints(x); + CheckAssociatedLoopConstraints(x); HasInvalidDistributeNesting(x); if (CurrentDirectiveIsNested() && llvm::omp::topTeamsSet.test(GetContextParent().directive)) { @@ -478,21 +490,6 @@ void OmpStructureChecker::SetLoopInfo(const parser::OpenMPLoopConstruct &x) { } } } -void OmpStructureChecker::CheckDoWhile(const parser::OpenMPLoopConstruct &x) { - const auto &beginLoopDir{std::get(x.t)}; - const auto &beginDir{std::get(beginLoopDir.t)}; - if (beginDir.v == llvm::omp::Directive::OMPD_do) { - if (const auto &doConstruct{ - std::get>(x.t)}) { - if (doConstruct.value().IsDoWhile()) { - const auto &doStmt{std::get>( - doConstruct.value().t)}; - context_.Say(doStmt.source, - "The DO loop cannot be a DO WHILE with DO directive."_err_en_US); - } - } - } -} void OmpStructureChecker::CheckLoopItrVariableIsInt( const parser::OpenMPLoopConstruct &x) { @@ -647,8 +644,8 @@ std::int64_t OmpStructureChecker::GetOrdCollapseLevel( const auto &beginLoopDir{std::get(x.t)}; const auto &clauseList{std::get(beginLoopDir.t)}; std::int64_t orderedCollapseLevel{1}; - std::int64_t orderedLevel{0}; - std::int64_t collapseLevel{0}; + std::int64_t orderedLevel{1}; + std::int64_t collapseLevel{1}; for (const auto &clause : clauseList.v) { if (const auto *collapseClause{ @@ -672,10 +669,10 @@ std::int64_t OmpStructureChecker::GetOrdCollapseLevel( return orderedCollapseLevel; } -void OmpStructureChecker::CheckCycleConstraints( +void OmpStructureChecker::CheckAssociatedLoopConstraints( const parser::OpenMPLoopConstruct &x) { std::int64_t ordCollapseLevel{GetOrdCollapseLevel(x)}; - OmpCycleAndExitChecker checker{context_, ordCollapseLevel}; + AssociatedLoopChecker checker{context_, ordCollapseLevel}; parser::Walk(x, checker); } diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index 47705771e8e28f..2cc1a78068f540 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -186,7 +186,7 @@ class OmpStructureChecker void CheckLoopItrVariableIsInt(const parser::OpenMPLoopConstruct &x); void CheckDoWhile(const parser::OpenMPLoopConstruct &x); - void CheckCycleConstraints(const parser::OpenMPLoopConstruct &x); + void CheckAssociatedLoopConstraints(const parser::OpenMPLoopConstruct &x); template bool IsOperatorValid(const T &, const D &); void CheckAtomicMemoryOrderClause( const parser::OmpAtomicClauseList *, const parser::OmpAtomicClauseList *); diff --git a/flang/runtime/misc-intrinsic.cpp b/flang/runtime/misc-intrinsic.cpp index f5b292a1f3d32c..2f7fcd2e2341fa 100644 --- a/flang/runtime/misc-intrinsic.cpp +++ b/flang/runtime/misc-intrinsic.cpp @@ -12,6 +12,7 @@ #include "flang/Common/optional.h" #include "flang/Runtime/descriptor.h" #include +#include #include namespace Fortran::runtime { @@ -55,6 +56,36 @@ static RT_API_ATTRS void TransferImpl(Descriptor &result, extern "C" { RT_EXT_API_GROUP_BEGIN +void RTDECL(Rename)(const Descriptor &path1, const Descriptor &path2, + const Descriptor *status, const char *sourceFile, int line) { + Terminator terminator{sourceFile, line}; + + char *pathSrc{EnsureNullTerminated( + path1.OffsetElement(), path1.ElementBytes(), terminator)}; + char *pathDst{EnsureNullTerminated( + path2.OffsetElement(), path2.ElementBytes(), terminator)}; + + // We simply call rename(2) from POSIX + int result{rename(pathSrc, pathDst)}; + if (status) { + // When an error has happened, + int errorCode{0}; // Assume success + if (result != 0) { + // The rename operation has failed, so return the error code as status. + errorCode = errno; + } + StoreIntToDescriptor(status, errorCode, terminator); + } + + // Deallocate memory if EnsureNullTerminated dynamically allocated memory + if (pathSrc != path1.OffsetElement()) { + FreeMemory(pathSrc); + } + if (pathDst != path2.OffsetElement()) { + FreeMemory(pathDst); + } +} + void RTDEF(Transfer)(Descriptor &result, const Descriptor &source, const Descriptor &mold, const char *sourceFile, int line) { Fortran::common::optional elements; diff --git a/flang/test/Driver/tune-cpu-fir.f90 b/flang/test/Driver/tune-cpu-fir.f90 new file mode 100644 index 00000000000000..43c13b426d5d9b --- /dev/null +++ b/flang/test/Driver/tune-cpu-fir.f90 @@ -0,0 +1,25 @@ +! RUN: %if aarch64-registered-target %{ %flang_fc1 -emit-fir -triple aarch64-unknown-linux-gnu -target-cpu aarch64 %s -o - | FileCheck %s --check-prefixes=ALL,ARMCPU %} +! RUN: %if aarch64-registered-target %{ %flang_fc1 -emit-fir -triple aarch64-unknown-linux-gnu -tune-cpu neoverse-n1 %s -o - | FileCheck %s --check-prefixes=ALL,ARMTUNE %} +! RUN: %if aarch64-registered-target %{ %flang_fc1 -emit-fir -triple aarch64-unknown-linux-gnu -target-cpu aarch64 -tune-cpu neoverse-n1 %s -o - | FileCheck %s --check-prefixes=ALL,ARMBOTH %} + +! RUN: %if x86-registered-target %{ %flang_fc1 -emit-fir -triple x86_64-unknown-linux-gnu -target-cpu x86-64 %s -o - | FileCheck %s --check-prefixes=ALL,X86CPU %} +! RUN: %if x86-registered-target %{ %flang_fc1 -emit-fir -triple x86_64-unknown-linux-gnu -tune-cpu pentium4 %s -o - | FileCheck %s --check-prefixes=ALL,X86TUNE %} +! RUN: %if x86-registered-target %{ %flang_fc1 -emit-fir -triple x86_64-unknown-linux-gnu -target-cpu x86-64 -tune-cpu pentium4 %s -o - | FileCheck %s --check-prefixes=ALL,X86BOTH %} + +! ALL: module attributes { + +! ARMCPU-SAME: fir.target_cpu = "aarch64" +! ARMCPU-NOT: fir.tune_cpu = "neoverse-n1" + +! ARMTUNE-SAME: fir.tune_cpu = "neoverse-n1" + +! ARMBOTH-SAME: fir.target_cpu = "aarch64" +! ARMBOTH-SAME: fir.tune_cpu = "neoverse-n1" + +! X86CPU-SAME: fir.target_cpu = "x86-64" +! X86CPU-NOT: fir.tune_cpu = "pentium4" + +! X86TUNE-SAME: fir.tune_cpu = "pentium4" + +! X86BOTH-SAME: fir.target_cpu = "x86-64" +! X86BOTH-SAME: fir.tune_cpu = "pentium4" diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir index 7bbfd709b0aaf6..dda4f32872fef5 100644 --- a/flang/test/Fir/basic-program.fir +++ b/flang/test/Fir/basic-program.fir @@ -119,4 +119,5 @@ func.func @_QQmain() { // PASSES-NEXT: (S) 0 num-dce'd - Number of operations eliminated // PASSES-NEXT: TargetRewrite // PASSES-NEXT: FIRToLLVMLowering +// PASSES-NEXT: ReconcileUnrealizedCasts // PASSES-NEXT: LLVMIRLoweringPass diff --git a/flang/test/Lower/Intrinsics/rename.f90 b/flang/test/Lower/Intrinsics/rename.f90 new file mode 100644 index 00000000000000..75042217c6202f --- /dev/null +++ b/flang/test/Lower/Intrinsics/rename.f90 @@ -0,0 +1,51 @@ +!RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s + +!CHECK-LABEL: func.func @_QPtest_rename +!CHECK-SAME: %[[dummySrc:.*]]: !fir.boxchar<1> {fir.bindc_name = "src"}, +!CHECK-SAME: %[[dummyDst:.*]]: !fir.boxchar<1> {fir.bindc_name = "dst"}) { +subroutine test_rename(src, dst) + implicit none + character(*) :: src, dst + + call rename(src, dst) + !CHECK: %[[dstUnbox:.*]]:2 = fir.unboxchar %[[dummyDst]] : (!fir.boxchar<1>) -> (!fir.ref>, index) + !CHECK-NEXT: %[[dstDecl:.*]]:2 = hlfir.declare %[[dstUnbox]]#0 typeparams %[[dstUnbox]]#1 dummy_scope %0 {uniq_name = "_QFtest_renameEdst"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) + !CHECK-NEXT: %[[srcUnbox:.*]]:2 = fir.unboxchar %[[dummySrc]] : (!fir.boxchar<1>) -> (!fir.ref>, index) + !CHECK-NEXT: %[[srcDecl:.*]]:2 = hlfir.declare %3#0 typeparams %[[srcUnbox]]#1 dummy_scope %0 {uniq_name = "_QFtest_renameEsrc"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) + !CHECK-NEXT: %[[srcBox:.*]] = fir.embox %[[srcDecl]]#1 typeparams %[[srcUnbox]]#1 : (!fir.ref>, index) -> !fir.box> + !CHECK-NEXT: %[[dstBox:.*]] = fir.embox %[[dstDecl]]#1 typeparams %[[dstUnbox]]#1 : (!fir.ref>, index) -> !fir.box> + !CHECK-NEXT: %[[statusBox:.*]] = fir.absent !fir.box + !CHECK-NEXT: %[[sourceFile:.*]] = fir.address_of(@[[someString:.*]]) : !fir.ref> + !CHECK-NEXT: %[[c10_i32:.*]] = arith.constant [[line:.*]] : i32 + !CHECK-NEXT: %[[src:.*]] = fir.convert %[[srcBox]] : (!fir.box>) -> !fir.box + !CHECK-NEXT: %[[dst:.*]] = fir.convert %[[dstBox]] : (!fir.box>) -> !fir.box + !CHECK-NEXT: %[[loc:.*]] = fir.convert %[[sourceFileConv:.*]]: (!fir.ref>) -> !fir.ref + !CHECK-NEXT: %[[result:.*]] = fir.call @_FortranARename(%[[src]], %[[dst]], %[[statusBox]], %[[loc]], %[[c10_i32]]) fastmath : (!fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none +end subroutine test_rename + +!CHECK-LABEL: func.func @_QPtest_rename_status +!CHECK-SAME: %[[dummySrc:.*]]: !fir.boxchar<1> {fir.bindc_name = "src"}, +!CHECK-SAME: %[[dummyDst:.*]]: !fir.boxchar<1> {fir.bindc_name = "dst"}) { +subroutine test_rename_status(src, dst) + implicit none + character(*) :: src, dst + integer :: status + + call rename(src, dst, status) + !CHECK: %[[dstUnbox:.*]]:2 = fir.unboxchar %[[dummyDst]] : (!fir.boxchar<1>) -> (!fir.ref>, index) + !CHECK-NEXT: %[[dstDecl:.*]]:2 = hlfir.declare %[[dstUnbox]]#0 typeparams %[[dstUnbox]]#1 dummy_scope %0 {uniq_name = "_QFtest_rename_statusEdst"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) + !CHECK-NEXT: %[[srcUnbox:.*]]:2 = fir.unboxchar %[[dummySrc]] : (!fir.boxchar<1>) -> (!fir.ref>, index) + !CHECK-NEXT: %[[srcDecl:.*]]:2 = hlfir.declare %3#0 typeparams %[[srcUnbox]]#1 dummy_scope %0 {uniq_name = "_QFtest_rename_statusEsrc"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) + !CHECK-NEXT: %[[statusAlloc:.*]] = fir.alloca i32 {bindc_name = "status", uniq_name = "_QFtest_rename_statusEstatus"} + !CHECK-NEXT: %[[statusDecl:.*]]:2 = hlfir.declare %[[statusAlloc]] {uniq_name = "_QFtest_rename_statusEstatus"} : (!fir.ref) -> (!fir.ref, !fir.ref) + !CHECK-NEXT: %[[srcBox:.*]] = fir.embox %[[srcDecl]]#1 typeparams %[[srcUnbox]]#1 : (!fir.ref>, index) -> !fir.box> + !CHECK-NEXT: %[[dstBox:.*]] = fir.embox %[[dstDecl]]#1 typeparams %[[dstUnbox]]#1 : (!fir.ref>, index) -> !fir.box> + !CHECK-NEXT: %[[statusBox:.*]] = fir.embox %[[statusDecl]]#1 : (!fir.ref) -> !fir.box + !CHECK-NEXT: %[[sourceFile:.*]] = fir.address_of(@[[someString:.*]]) : !fir.ref> + !CHECK-NEXT: %[[c10_i32:.*]] = arith.constant [[line:.*]] : i32 + !CHECK-NEXT: %[[src:.*]] = fir.convert %[[srcBox]] : (!fir.box>) -> !fir.box + !CHECK-NEXT: %[[dst:.*]] = fir.convert %[[dstBox]] : (!fir.box>) -> !fir.box + !CHECK-NEXT: %[[status:.*]] = fir.convert %[[statusBox]] : (!fir.box) -> !fir.box + !CHECK-NEXT: %[[loc:.*]] = fir.convert %[[sourceFileConv:.*]]: (!fir.ref>) -> !fir.ref + !CHECK-NEXT: %[[result:.*]] = fir.call @_FortranARename(%[[src]], %[[dst]], %[[status]], %[[loc]], %[[c10_i32]]) fastmath : (!fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none +end subroutine test_rename_status diff --git a/flang/test/Lower/Intrinsics/second.f90 b/flang/test/Lower/Intrinsics/second.f90 new file mode 100644 index 00000000000000..f1e66506aaaca9 --- /dev/null +++ b/flang/test/Lower/Intrinsics/second.f90 @@ -0,0 +1,37 @@ +!RUN: bbc -emit-hlfir %s -o - | FileCheck %s +!RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s + +subroutine test_subroutine(time) + real :: time + call second(time) +end subroutine +! CHECK-LABEL: func.func @_QPtest_subroutine( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref {fir.bindc_name = "time"}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_subroutineEtime"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_3:.*]] = fir.call @_FortranACpuTime() fastmath : () -> f64 +! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (f64) -> f32 +! CHECK: fir.store %[[VAL_4]] to %[[VAL_2]]#1 : !fir.ref +! CHECK: return +! CHECK: } + + +subroutine test_function(time) + real :: time + time = second() +end subroutine +! CHECK-LABEL: func.func @_QPtest_function( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref {fir.bindc_name = "time"}) { +! CHECK: %[[VAL_1:.*]] = fir.alloca f32 +! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_functionEtime"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_4:.*]] = fir.call @_FortranACpuTime() fastmath : () -> f64 +! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (f64) -> f32 +! CHECK: fir.store %[[VAL_5]] to %[[VAL_1]] : !fir.ref +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_7:.*]] = arith.constant false +! CHECK: %[[VAL_8:.*]] = hlfir.as_expr %[[VAL_6]]#0 move %[[VAL_7]] : (!fir.ref, i1) -> !hlfir.expr +! CHECK: hlfir.assign %[[VAL_8]] to %[[VAL_3]]#0 : !hlfir.expr, !fir.ref +! CHECK: hlfir.destroy %[[VAL_8]] : !hlfir.expr +! CHECK: return +! CHECK: } diff --git a/flang/test/Lower/namelist.f90 b/flang/test/Lower/namelist.f90 index 9fdd8a2c8f6131..a96bbbfad0cd6b 100644 --- a/flang/test/Lower/namelist.f90 +++ b/flang/test/Lower/namelist.f90 @@ -71,7 +71,7 @@ program p ! CHECK: %[[V_70:[0-9]+]] = fir.call @_FortranAioEndIoStatement(%[[V_58]]) fastmath : (!fir.ref) -> i32 write(*, nnn) - call rename + call rename_sub end ! CHECK-LABEL: c.func @_QPsss @@ -128,8 +128,8 @@ module mmm namelist /aaa/ rrr end -! CHECK-LABEL: c.func @_QPrename -subroutine rename +! CHECK-LABEL: c.func @_QPrename_sub +subroutine rename_sub use mmm, bbb => aaa rrr = 3. ! CHECK: %[[V_4:[0-9]+]] = fir.call @_FortranAioBeginExternalListOutput diff --git a/flang/test/Lower/tune-cpu-llvm.f90 b/flang/test/Lower/tune-cpu-llvm.f90 new file mode 100644 index 00000000000000..dc2a68730cf23c --- /dev/null +++ b/flang/test/Lower/tune-cpu-llvm.f90 @@ -0,0 +1,8 @@ +! RUN: %if x86-registered-target %{ %flang -mtune=pentium4 -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=ALL,CHECK-X86 %} +! RUN: %if aarch64-registered-target %{ %flang -mtune=neoverse-n1 -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=ALL,CHECK-ARM %} + +!ALL: attributes #{{[0-9]+}} = { +!CHECK-X86-SAME: "tune-cpu"="pentium4" +!CHECK-ARM-SAME: "tune-cpu"="neoverse-n1" +subroutine a +end subroutine a diff --git a/flang/test/Semantics/OpenMP/clause-validity01.f90 b/flang/test/Semantics/OpenMP/clause-validity01.f90 index 22ac57065ffecb..745895248ddf4d 100644 --- a/flang/test/Semantics/OpenMP/clause-validity01.f90 +++ b/flang/test/Semantics/OpenMP/clause-validity01.f90 @@ -173,6 +173,7 @@ outer: do i=0, 10 inner: do j=1, 10 exit + !ERROR: EXIT statement terminates associated loop of an OpenMP DO construct exit outer !ERROR: EXIT to construct 'outofparallel' outside of PARALLEL construct is not allowed !ERROR: EXIT to construct 'outofparallel' outside of DO construct is not allowed diff --git a/flang/test/Semantics/OpenMP/do-collapse.f90 b/flang/test/Semantics/OpenMP/do-collapse.f90 index 145b7b75d28dfc..4f2512937ace4e 100644 --- a/flang/test/Semantics/OpenMP/do-collapse.f90 +++ b/flang/test/Semantics/OpenMP/do-collapse.f90 @@ -26,6 +26,7 @@ program omp_doCollapse !$omp parallel do collapse(2) do i = 1, 3 !ERROR: Loop control is not present in the DO LOOP + !ERROR: The associated loop of a loop-associated directive cannot be a DO without control. do end do end do diff --git a/flang/test/Semantics/OpenMP/do09.f90 b/flang/test/Semantics/OpenMP/do09.f90 index af9f2e294ace96..624a11555f1053 100644 --- a/flang/test/Semantics/OpenMP/do09.f90 +++ b/flang/test/Semantics/OpenMP/do09.f90 @@ -6,7 +6,7 @@ program omp_do integer :: i = 0,k !$omp do - !ERROR: The DO loop cannot be a DO WHILE with DO directive. + !ERROR: The associated loop of a loop-associated directive cannot be a DO WHILE. do while (i <= 10) print *, "it",i i = i+1 @@ -14,7 +14,7 @@ program omp_do !$omp end do !$omp do - !ERROR: The DO loop cannot be a DO WHILE with DO directive. + !ERROR: The associated loop of a loop-associated directive cannot be a DO WHILE. do while (i <= 10) do while (j <= 10) print *, "it",k diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index e5e41ad3e9cf29..07eef065daf6f4 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -367,11 +367,12 @@ static llvm::LogicalResult convertFortranSourceToMLIR( loweringOptions.setLowerToHighLevelFIR(useHLFIR || emitHLFIR); loweringOptions.setNSWOnLoopVarInc(setNSW); std::vector envDefaults = {}; + constexpr const char *tuneCPU = ""; auto burnside = Fortran::lower::LoweringBridge::create( ctx, semanticsContext, defKinds, semanticsContext.intrinsics(), semanticsContext.targetCharacteristics(), parsing.allCooked(), targetTriple, kindMap, loweringOptions, envDefaults, - semanticsContext.languageFeatures(), targetMachine); + semanticsContext.languageFeatures(), targetMachine, tuneCPU); mlir::ModuleOp mlirModule = burnside.getModule(); if (enableOpenMP) { if (enableOpenMPGPU && !enableOpenMPDevice) { diff --git a/flang/tools/tco/tco.cpp b/flang/tools/tco/tco.cpp index 34ac0e1a5cb98a..a8c64333109aeb 100644 --- a/flang/tools/tco/tco.cpp +++ b/flang/tools/tco/tco.cpp @@ -58,6 +58,9 @@ static cl::opt targetTriple("target", static cl::opt targetCPU("target-cpu", cl::desc("specify a target CPU"), cl::init("")); +static cl::opt tuneCPU("tune-cpu", cl::desc("specify a tune CPU"), + cl::init("")); + static cl::opt targetFeatures("target-features", cl::desc("specify the target features"), cl::init("")); @@ -113,6 +116,7 @@ compileFIR(const mlir::PassPipelineCLParser &passPipeline) { fir::setTargetTriple(*owningRef, targetTriple); fir::setKindMapping(*owningRef, kindMap); fir::setTargetCPU(*owningRef, targetCPU); + fir::setTuneCPU(*owningRef, tuneCPU); fir::setTargetFeatures(*owningRef, targetFeatures); // tco is a testing tool, so it will happily use the target independent // data layout if none is on the module. diff --git a/flang/unittests/Optimizer/FIRContextTest.cpp b/flang/unittests/Optimizer/FIRContextTest.cpp index 49e1ebf23d8aa6..3f8b59ac94a95a 100644 --- a/flang/unittests/Optimizer/FIRContextTest.cpp +++ b/flang/unittests/Optimizer/FIRContextTest.cpp @@ -34,6 +34,7 @@ struct StringAttributesTests : public testing::Test { "i10:80,l3:24,a1:8,r54:Double,r62:X86_FP80,r11:PPC_FP128"; std::string target = "powerpc64le-unknown-linux-gnu"; std::string targetCPU = "gfx90a"; + std::string tuneCPU = "generic"; std::string targetFeatures = "+gfx9-insts,+wavefrontsize64"; mlir::ModuleOp mod; }; @@ -42,6 +43,7 @@ TEST_F(StringAttributesTests, moduleStringAttrTest) { setTargetTriple(mod, target); setKindMapping(mod, *kindMap); setTargetCPU(mod, targetCPU); + setTuneCPU(mod, tuneCPU); setTargetFeatures(mod, targetFeatures); auto triple = getTargetTriple(mod); @@ -61,6 +63,7 @@ TEST_F(StringAttributesTests, moduleStringAttrTest) { EXPECT_TRUE(mapStr.find("r62:X86_FP80") != std::string::npos); EXPECT_EQ(getTargetCPU(mod), targetCPU); + EXPECT_EQ(getTuneCPU(mod), tuneCPU); auto features = getTargetFeatures(mod); auto featuresList = features.getFeatures(); diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt index d167abcaf2db1c..eaeecbdacd23ec 100644 --- a/libc/benchmarks/gpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/CMakeLists.txt @@ -43,6 +43,7 @@ add_unittest_framework_library( libc.src.__support.CPP.functional libc.src.__support.CPP.limits libc.src.__support.CPP.algorithm + libc.src.__support.CPP.atomic libc.src.__support.fixed_point.fx_rep libc.src.__support.macros.properties.types libc.src.__support.OSUtil.osutil diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index 1c1ba7639d0b17..23fff3e8180f7d 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -1,6 +1,7 @@ #include "LibcGpuBenchmark.h" #include "src/__support/CPP/algorithm.h" #include "src/__support/CPP/array.h" +#include "src/__support/CPP/atomic.h" #include "src/__support/CPP/string.h" #include "src/__support/FPUtil/sqrt.h" #include "src/__support/GPU/utils.h" @@ -12,41 +13,96 @@ namespace LIBC_NAMESPACE_DECL { namespace benchmarks { FixedVector benchmarks; -cpp::array results; void Benchmark::add_benchmark(Benchmark *benchmark) { benchmarks.push_back(benchmark); } -BenchmarkResult -reduce_results(const cpp::array &results) { - BenchmarkResult result; - uint64_t cycles_sum = 0; - double standard_deviation_sum = 0; - uint64_t min = UINT64_MAX; - uint64_t max = 0; - uint32_t samples_sum = 0; - uint32_t iterations_sum = 0; - clock_t time_sum = 0; - uint64_t num_threads = gpu::get_num_threads(); - for (uint64_t i = 0; i < num_threads; i++) { - BenchmarkResult current_result = results[i]; - cycles_sum += current_result.cycles; - standard_deviation_sum += current_result.standard_deviation; - min = cpp::min(min, current_result.min); - max = cpp::max(max, current_result.max); - samples_sum += current_result.samples; - iterations_sum += current_result.total_iterations; - time_sum += current_result.total_time; +struct AtomicBenchmarkSums { + cpp::Atomic cycles_sum = 0; + cpp::Atomic standard_deviation_sum = 0; + cpp::Atomic min = UINT64_MAX; + cpp::Atomic max = 0; + cpp::Atomic samples_sum = 0; + cpp::Atomic iterations_sum = 0; + cpp::Atomic time_sum = 0; + cpp::Atomic active_threads = 0; + + void reset() { + cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); + active_threads.store(0, cpp::MemoryOrder::RELAXED); + cycles_sum.store(0, cpp::MemoryOrder::RELAXED); + standard_deviation_sum.store(0, cpp::MemoryOrder::RELAXED); + min.store(UINT64_MAX, cpp::MemoryOrder::RELAXED); + max.store(0, cpp::MemoryOrder::RELAXED); + samples_sum.store(0, cpp::MemoryOrder::RELAXED); + iterations_sum.store(0, cpp::MemoryOrder::RELAXED); + time_sum.store(0, cpp::MemoryOrder::RELAXED); + cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); } - result.cycles = cycles_sum / num_threads; - result.standard_deviation = standard_deviation_sum / num_threads; - result.min = min; - result.max = max; - result.samples = samples_sum / num_threads; - result.total_iterations = iterations_sum / num_threads; - result.total_time = time_sum / num_threads; - return result; + + void update(const BenchmarkResult &result) { + cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); + active_threads.fetch_add(1, cpp::MemoryOrder::RELAXED); + + cycles_sum.fetch_add(result.cycles, cpp::MemoryOrder::RELAXED); + standard_deviation_sum.fetch_add( + static_cast(result.standard_deviation), + cpp::MemoryOrder::RELAXED); + + // Perform a CAS loop to atomically update the min + uint64_t orig_min = min.load(cpp::MemoryOrder::RELAXED); + while (!min.compare_exchange_strong( + orig_min, cpp::min(orig_min, result.min), cpp::MemoryOrder::ACQUIRE, + cpp::MemoryOrder::RELAXED)) + ; + + // Perform a CAS loop to atomically update the max + uint64_t orig_max = max.load(cpp::MemoryOrder::RELAXED); + while (!max.compare_exchange_strong( + orig_max, cpp::max(orig_max, result.max), cpp::MemoryOrder::ACQUIRE, + cpp::MemoryOrder::RELAXED)) + ; + + samples_sum.fetch_add(result.samples, cpp::MemoryOrder::RELAXED); + iterations_sum.fetch_add(result.total_iterations, + cpp::MemoryOrder::RELAXED); + time_sum.fetch_add(result.total_time, cpp::MemoryOrder::RELAXED); + cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); + } +}; + +AtomicBenchmarkSums all_results; + +void print_results(Benchmark *b) { + constexpr auto GREEN = "\033[32m"; + constexpr auto RESET = "\033[0m"; + + BenchmarkResult result; + cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); + int num_threads = all_results.active_threads.load(cpp::MemoryOrder::RELAXED); + result.cycles = + all_results.cycles_sum.load(cpp::MemoryOrder::RELAXED) / num_threads; + result.standard_deviation = + all_results.standard_deviation_sum.load(cpp::MemoryOrder::RELAXED) / + num_threads; + result.min = all_results.min.load(cpp::MemoryOrder::RELAXED); + result.max = all_results.max.load(cpp::MemoryOrder::RELAXED); + result.samples = + all_results.samples_sum.load(cpp::MemoryOrder::RELAXED) / num_threads; + result.total_iterations = + all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED) / num_threads; + result.total_time = + all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads; + cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); + + log << GREEN << "[ RUN ] " << RESET << b->get_name() << '\n'; + log << GREEN << "[ OK ] " << RESET << b->get_name() << ": " + << result.cycles << " cycles, " << result.min << " min, " << result.max + << " max, " << result.total_iterations << " iterations, " + << result.total_time << " ns, " + << static_cast(result.standard_deviation) + << " stddev (num threads: " << num_threads << ")\n"; } void Benchmark::run_benchmarks() { @@ -54,19 +110,16 @@ void Benchmark::run_benchmarks() { gpu::sync_threads(); for (Benchmark *b : benchmarks) { - results[id] = b->run(); + if (id == 0) + all_results.reset(); + gpu::sync_threads(); - if (id == 0) { - BenchmarkResult all_results = reduce_results(results); - constexpr auto GREEN = "\033[32m"; - constexpr auto RESET = "\033[0m"; - log << GREEN << "[ RUN ] " << RESET << b->get_name() << '\n'; - log << GREEN << "[ OK ] " << RESET << b->get_name() << ": " - << all_results.cycles << " cycles, " << all_results.min << " min, " - << all_results.max << " max, " << all_results.total_iterations - << " iterations, " << all_results.total_time << " ns, " - << static_cast(all_results.standard_deviation) << " stddev\n"; - } + auto current_result = b->run(); + all_results.update(current_result); + gpu::sync_threads(); + + if (id == 0) + print_results(b); } gpu::sync_threads(); } diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h index 26cb0fd30bc1cb..1f813f8655de6a 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.h +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h @@ -88,6 +88,7 @@ class Benchmark { } static void run_benchmarks(); + const cpp::string_view get_name() const { return name; } protected: static void add_benchmark(Benchmark *benchmark); @@ -97,13 +98,12 @@ class Benchmark { BenchmarkOptions options; return benchmark(options, func); } - const cpp::string_view get_name() const { return name; } }; } // namespace benchmarks } // namespace LIBC_NAMESPACE_DECL #define BENCHMARK(SuiteName, TestName, Func) \ LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \ - Func, #SuiteName "." #TestName); + Func, #SuiteName "." #TestName) #endif diff --git a/libc/cmake/modules/CheckCompilerFeatures.cmake b/libc/cmake/modules/CheckCompilerFeatures.cmake index d84c07b35d2d78..a6d793d495c45e 100644 --- a/libc/cmake/modules/CheckCompilerFeatures.cmake +++ b/libc/cmake/modules/CheckCompilerFeatures.cmake @@ -2,7 +2,15 @@ # Compiler features definition and flags # ------------------------------------------------------------------------------ -set(ALL_COMPILER_FEATURES "float16" "float128" "fixed_point") +set( + ALL_COMPILER_FEATURES + "builtin_ceil_floor_rint_trunc" + "builtin_round" + "builtin_roundeven" + "float16" + "float128" + "fixed_point" +) # Making sure ALL_COMPILER_FEATURES is sorted. list(SORT ALL_COMPILER_FEATURES) @@ -39,11 +47,22 @@ endfunction() set(AVAILABLE_COMPILER_FEATURES "") # Try compile a C file to check if flag is supported. -set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) foreach(feature IN LISTS ALL_COMPILER_FEATURES) + set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) set(compile_options ${LIBC_COMPILE_OPTIONS_NATIVE}) + set(link_options "") if(${feature} STREQUAL "fixed_point") list(APPEND compile_options "-ffixed-point") + elseif(${feature} MATCHES "^builtin_") + set(compile_options ${LIBC_COMPILE_OPTIONS_DEFAULT}) + set(link_options -nostdlib) + # The compiler might handle calls to rounding builtins by generating calls + # to the respective libc math functions, in which case we cannot use these + # builtins in our implementations of these functions. We check that this is + # not the case by trying to link an executable, since linking would fail due + # to unresolved references with -nostdlib if calls to libc functions were + # generated. + set(CMAKE_TRY_COMPILE_TARGET_TYPE EXECUTABLE) endif() try_compile( @@ -51,6 +70,7 @@ foreach(feature IN LISTS ALL_COMPILER_FEATURES) ${CMAKE_CURRENT_BINARY_DIR}/compiler_features SOURCES ${LIBC_SOURCE_DIR}/cmake/modules/compiler_features/check_${feature}.cpp COMPILE_DEFINITIONS -I${LIBC_SOURCE_DIR} ${compile_options} + LINK_OPTIONS ${link_options} ) if(has_feature) list(APPEND AVAILABLE_COMPILER_FEATURES ${feature}) @@ -60,6 +80,12 @@ foreach(feature IN LISTS ALL_COMPILER_FEATURES) set(LIBC_TYPES_HAS_FLOAT128 TRUE) elseif(${feature} STREQUAL "fixed_point") set(LIBC_COMPILER_HAS_FIXED_POINT TRUE) + elseif(${feature} STREQUAL "builtin_ceil_floor_rint_trunc") + set(LIBC_COMPILER_HAS_BUILTIN_CEIL_FLOOR_RINT_TRUNC TRUE) + elseif(${feature} STREQUAL "builtin_round") + set(LIBC_COMPILER_HAS_BUILTIN_ROUND TRUE) + elseif(${feature} STREQUAL "builtin_roundeven") + set(LIBC_COMPILER_HAS_BUILTIN_ROUNDEVEN TRUE) endif() endif() endforeach() diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake index c5e7dfe8abd0fb..62e16d52cb3ea0 100644 --- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake +++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake @@ -4,7 +4,7 @@ function(_get_compile_options_from_flags output_var) if(LIBC_TARGET_ARCHITECTURE_IS_RISCV64 OR(LIBC_CPU_FEATURES MATCHES "FMA")) check_flag(ADD_FMA_FLAG ${FMA_OPT_FLAG} ${ARGN}) endif() - check_flag(ADD_SSE4_2_FLAG ${ROUND_OPT_FLAG} ${ARGN}) + check_flag(ADD_ROUND_OPT_FLAG ${ROUND_OPT_FLAG} ${ARGN}) check_flag(ADD_EXPLICIT_SIMD_OPT_FLAG ${EXPLICIT_SIMD_OPT_FLAG} ${ARGN}) if(LLVM_COMPILER_IS_GCC_COMPATIBLE) @@ -16,8 +16,23 @@ function(_get_compile_options_from_flags output_var) list(APPEND compile_options "-D__LIBC_RISCV_USE_FMA") endif() endif() - if(ADD_SSE4_2_FLAG) - list(APPEND compile_options "-msse4.2") + if(ADD_ROUND_OPT_FLAG) + if(LIBC_TARGET_ARCHITECTURE_IS_X86) + # ROUND_OPT_FLAG is only enabled if SSE4.2 is detected, not just SSE4.1, + # because there was code to check for SSE4.2 already, and few CPUs only + # have SSE4.1. + list(APPEND compile_options "-msse4.2") + endif() + if(LIBC_COMPILER_HAS_BUILTIN_CEIL_FLOOR_RINT_TRUNC) + list(APPEND compile_options + "-D__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC") + endif() + if(LIBC_COMPILER_HAS_BUILTIN_ROUND) + list(APPEND compile_options "-D__LIBC_USE_BUILTIN_ROUND") + endif() + if(LIBC_COMPILER_HAS_BUILTIN_ROUNDEVEN) + list(APPEND compile_options "-D__LIBC_USE_BUILTIN_ROUNDEVEN") + endif() endif() if(ADD_EXPLICIT_SIMD_OPT_FLAG) list(APPEND compile_options "-D__LIBC_EXPLICIT_SIMD_OPT") @@ -34,10 +49,21 @@ function(_get_compile_options_from_flags output_var) set(${output_var} ${compile_options} PARENT_SCOPE) endfunction(_get_compile_options_from_flags) +function(_get_compile_options_from_config output_var) + set(config_options "") + + if(LIBC_CONF_QSORT_IMPL) + list(APPEND config_options "-DLIBC_QSORT_IMPL=${LIBC_CONF_QSORT_IMPL}") + endif() + + set(${output_var} ${config_options} PARENT_SCOPE) +endfunction(_get_compile_options_from_config) + function(_get_common_compile_options output_var flags) _get_compile_options_from_flags(compile_flags ${flags}) + _get_compile_options_from_config(config_flags) - set(compile_options ${LIBC_COMPILE_OPTIONS_DEFAULT} ${compile_flags}) + set(compile_options ${LIBC_COMPILE_OPTIONS_DEFAULT} ${compile_flags} ${config_flags}) if(LLVM_COMPILER_IS_GCC_COMPATIBLE) list(APPEND compile_options "-fpie") diff --git a/libc/cmake/modules/LLVMLibCFlagRules.cmake b/libc/cmake/modules/LLVMLibCFlagRules.cmake index 18e36dfde5cc19..eca7ba8d183e6b 100644 --- a/libc/cmake/modules/LLVMLibCFlagRules.cmake +++ b/libc/cmake/modules/LLVMLibCFlagRules.cmake @@ -277,6 +277,7 @@ if(NOT(LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "SSE2"))) endif() # Skip ROUND_OPT flag for targets that don't support SSE 4.2. -if(NOT(LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "SSE4_2"))) +if(NOT((LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "SSE4_2")) OR + LIBC_TARGET_ARCHITECTURE_IS_AARCH64)) set(SKIP_FLAG_EXPANSION_ROUND_OPT TRUE) endif() diff --git a/libc/cmake/modules/compiler_features/check_builtin_ceil_floor_rint_trunc.cpp b/libc/cmake/modules/compiler_features/check_builtin_ceil_floor_rint_trunc.cpp new file mode 100644 index 00000000000000..946200001d69f0 --- /dev/null +++ b/libc/cmake/modules/compiler_features/check_builtin_ceil_floor_rint_trunc.cpp @@ -0,0 +1,11 @@ +float try_builtin_ceilf(float x) { return __builtin_ceilf(x); } +float try_builtin_floorf(float x) { return __builtin_floorf(x); } +float try_builtin_rintf(float x) { return __builtin_rintf(x); } +float try_builtin_truncf(float x) { return __builtin_truncf(x); } + +double try_builtin_ceil(double x) { return __builtin_ceil(x); } +double try_builtin_floor(double x) { return __builtin_floor(x); } +double try_builtin_rint(double x) { return __builtin_rint(x); } +double try_builtin_trunc(double x) { return __builtin_trunc(x); } + +extern "C" void _start() {} diff --git a/libc/cmake/modules/compiler_features/check_builtin_round.cpp b/libc/cmake/modules/compiler_features/check_builtin_round.cpp new file mode 100644 index 00000000000000..79a347ada8b601 --- /dev/null +++ b/libc/cmake/modules/compiler_features/check_builtin_round.cpp @@ -0,0 +1,5 @@ +float try_builtin_roundf(float x) { return __builtin_roundf(x); } + +double try_builtin_round(double x) { return __builtin_round(x); } + +extern "C" void _start() {} diff --git a/libc/cmake/modules/compiler_features/check_builtin_roundeven.cpp b/libc/cmake/modules/compiler_features/check_builtin_roundeven.cpp new file mode 100644 index 00000000000000..0aa40dc7f4b7a9 --- /dev/null +++ b/libc/cmake/modules/compiler_features/check_builtin_roundeven.cpp @@ -0,0 +1,5 @@ +float try_builtin_roundevenf(float x) { return __builtin_roundevenf(x); } + +double try_builtin_roundeven(double x) { return __builtin_roundeven(x); } + +extern "C" void _start() {} diff --git a/libc/config/baremetal/api.td b/libc/config/baremetal/api.td index a6547d843c85ee..7421d86fabeb08 100644 --- a/libc/config/baremetal/api.td +++ b/libc/config/baremetal/api.td @@ -5,41 +5,6 @@ include "spec/stdc_ext.td" include "spec/bsd_ext.td" include "spec/llvm_libc_stdfix_ext.td" -def AssertMacro : MacroDef<"assert"> { - let Defn = [{ - #undef assert - - #ifdef NDEBUG - #define assert(e) (void)0 - #else - - #ifdef __cplusplus - extern "C" - #endif - _Noreturn void __assert_fail(const char *, const char *, unsigned, const char *) __NOEXCEPT; - - #define assert(e) \ - ((e) ? (void)0 : __assert_fail(#e, __FILE__, __LINE__, __PRETTY_FUNCTION__)) - - #endif - }]; -} - -def StaticAssertMacro : MacroDef<"static_assert"> { - let Defn = [{ - #ifndef __cplusplus - #undef static_assert - #define static_assert _Static_assert - #endif - }]; -} - -def AssertAPI : PublicAPI<"assert.h"> { - let Macros = [ - AssertMacro, - StaticAssertMacro, - ]; -} def CTypeAPI : PublicAPI<"ctype.h"> { } diff --git a/libc/config/baremetal/config.json b/libc/config/baremetal/config.json index b7426dd341d977..12f4c2aa3a805c 100644 --- a/libc/config/baremetal/config.json +++ b/libc/config/baremetal/config.json @@ -22,5 +22,10 @@ "LIBC_CONF_FREELIST_MALLOC_BUFFER_SIZE": { "value": 102400 } + }, + "qsort": { + "LIBC_CONF_QSORT_IMPL": { + "value": "LIBC_QSORT_HEAP_SORT" + } } } diff --git a/libc/config/config.json b/libc/config/config.json index 3a9c08d195445a..94bfed894c173c 100644 --- a/libc/config/config.json +++ b/libc/config/config.json @@ -82,5 +82,11 @@ "value": 0, "doc": "Configures optimizations for math functions. Values accepted are LIBC_MATH_SKIP_ACCURATE_PASS, LIBC_MATH_SMALL_TABLES, LIBC_MATH_NO_ERRNO, LIBC_MATH_NO_EXCEPT, and LIBC_MATH_FAST." } + }, + "qsort": { + "LIBC_CONF_QSORT_IMPL": { + "value": "LIBC_QSORT_QUICK_SORT", + "doc": "Configures sorting algorithm for qsort and qsort_r. Values accepted are LIBC_QSORT_QUICK_SORT, LIBC_QSORT_HEAP_SORT." + } } } diff --git a/libc/config/gpu/api.td b/libc/config/gpu/api.td index 21ddbb95b70c91..995ff31c4ac9e9 100644 --- a/libc/config/gpu/api.td +++ b/libc/config/gpu/api.td @@ -7,35 +7,6 @@ include "spec/gnu_ext.td" include "spec/stdc_ext.td" include "spec/llvm_libc_ext.td" -def AssertMacro : MacroDef<"assert"> { - let Defn = [{ - #undef assert - - #ifdef NDEBUG - #define assert(e) (void)0 - #else - - #define assert(e) \ - ((e) ? (void)0 : __assert_fail(#e, __FILE__, __LINE__, __PRETTY_FUNCTION__)) - #endif - }]; -} - -def StaticAssertMacro : MacroDef<"static_assert"> { - let Defn = [{ - #ifndef __cplusplus - #undef static_assert - #define static_assert _Static_assert - #endif - }]; -} - -def AssertAPI : PublicAPI<"assert.h"> { - let Macros = [ - AssertMacro, - StaticAssertMacro, - ]; -} def StringAPI : PublicAPI<"string.h"> { let Types = ["size_t"]; diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index 624ac2715579f1..63228216c85ec7 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -218,6 +218,9 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.time.clock libc.src.time.nanosleep + # wchar.h entrypoints + libc.src.wchar.wctob + # gpu/rpc.h entrypoints libc.src.gpu.rpc_host_call libc.src.gpu.rpc_fprintf diff --git a/libc/config/gpu/headers.txt b/libc/config/gpu/headers.txt index dd16938da8a447..1d4038d5eb45a4 100644 --- a/libc/config/gpu/headers.txt +++ b/libc/config/gpu/headers.txt @@ -12,6 +12,8 @@ set(TARGET_PUBLIC_HEADERS libc.include.errno libc.include.stdlib libc.include.stdio + libc.include.wchar + libc.include.uchar # Header for RPC extensions libc.include.gpu_rpc diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td index 60e9b70f0d8a41..a10dec06e2452c 100644 --- a/libc/config/linux/api.td +++ b/libc/config/linux/api.td @@ -9,42 +9,6 @@ include "spec/stdc_ext.td" include "spec/llvm_libc_ext.td" include "spec/llvm_libc_stdfix_ext.td" -def AssertMacro : MacroDef<"assert"> { - let Defn = [{ - #undef assert - - #ifdef NDEBUG - #define assert(e) (void)0 - #else - - #ifdef __cplusplus - extern "C" - #endif - _Noreturn void __assert_fail(const char *, const char *, unsigned, const char *) __NOEXCEPT; - - #define assert(e) \ - ((e) ? (void)0 : __assert_fail(#e, __FILE__, __LINE__, __PRETTY_FUNCTION__)) - - #endif - }]; -} - -def StaticAssertMacro : MacroDef<"static_assert"> { - let Defn = [{ - #ifndef __cplusplus - #undef static_assert - #define static_assert _Static_assert - #endif - }]; -} - -def AssertAPI : PublicAPI<"assert.h"> { - let Macros = [ - AssertMacro, - StaticAssertMacro, - ]; -} - def CTypeAPI : PublicAPI<"ctype.h"> { } diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst index 24ef2ef189ffd9..dfb35f6a6611a4 100644 --- a/libc/docs/configure.rst +++ b/libc/docs/configure.rst @@ -44,6 +44,8 @@ to learn about the defaults for your platform and target. - ``LIBC_CONF_RAW_MUTEX_DEFAULT_SPIN_COUNT``: Default number of spins before blocking if a mutex is in contention (default to 100). - ``LIBC_CONF_RWLOCK_DEFAULT_SPIN_COUNT``: Default number of spins before blocking if a rwlock is in contention (default to 100). - ``LIBC_CONF_TIMEOUT_ENSURE_MONOTONICITY``: Automatically adjust timeout to CLOCK_MONOTONIC (default to true). POSIX API may require CLOCK_REALTIME, which can be unstable and leading to unexpected behavior. This option will convert the real-time timestamp to monotonic timestamp relative to the time of call. +* **"qsort" options** + - ``LIBC_CONF_QSORT_IMPL``: Configures sorting algorithm for qsort and qsort_r. Values accepted are LIBC_QSORT_QUICK_SORT, LIBC_QSORT_HEAP_SORT. * **"scanf" options** - ``LIBC_CONF_SCANF_DISABLE_FLOAT``: Disable parsing floating point values in scanf and friends. - ``LIBC_CONF_SCANF_DISABLE_INDEX_MODE``: Disable index mode in the scanf format string. diff --git a/libc/include/assert.h.def b/libc/include/assert.h.def index 15077e53e2ca48..9c924c7f585457 100644 --- a/libc/include/assert.h.def +++ b/libc/include/assert.h.def @@ -12,4 +12,24 @@ // This file may be usefully included multiple times to change assert()'s // definition based on NDEBUG. + +#undef assert +#ifdef NDEBUG +#define assert(e) (void)0 +#else + +#ifndef __cplusplus +#undef static_assert +#define static_assert _Static_assert +#endif + +#ifdef __cplusplus +extern "C" +#endif +_Noreturn void __assert_fail(const char *, const char *, unsigned, const char *) __NOEXCEPT; + +#define assert(e) \ + ((e) ? (void)0 : __assert_fail(#e, __FILE__, __LINE__, __PRETTY_FUNCTION__)) +#endif + %%public_api() diff --git a/libc/include/uchar.h.def b/libc/include/uchar.h.def index 31b7fcb73ded6a..83400f4aba2eef 100644 --- a/libc/include/uchar.h.def +++ b/libc/include/uchar.h.def @@ -10,6 +10,7 @@ #define LLVM_LIBC_UCHAR_H #include "__llvm-libc-common.h" +#include "llvm-libc-types/mbstate_t.h" %%public_api() diff --git a/libc/include/wchar.h.def b/libc/include/wchar.h.def index 4c25de700d6063..d0de1a6762a392 100644 --- a/libc/include/wchar.h.def +++ b/libc/include/wchar.h.def @@ -11,6 +11,8 @@ #include "__llvm-libc-common.h" #include "llvm-libc-macros/wchar-macros.h" +#include "llvm-libc-types/wint_t.h" +#include "llvm-libc-types/mbstate_t.h" %%public_api() diff --git a/libc/newhdrgen/class_implementation/classes/function.py b/libc/newhdrgen/class_implementation/classes/function.py index ea5e8223a538ef..ccfd93547c1d8a 100644 --- a/libc/newhdrgen/class_implementation/classes/function.py +++ b/libc/newhdrgen/class_implementation/classes/function.py @@ -26,7 +26,7 @@ def __str__(self): attributes_str = " ".join(self.attributes) arguments_str = ", ".join(self.arguments) if attributes_str == "": - result = f"{self.return_type} {self.name}({arguments_str}) __NOEXCEPT;" + result = f"{self.return_type} {self.name}({arguments_str});" else: - result = f"{attributes_str} {self.return_type} {self.name}({arguments_str}) __NOEXCEPT;" + result = f"{attributes_str} {self.return_type} {self.name}({arguments_str})" return result diff --git a/libc/newhdrgen/gpu_headers.py b/libc/newhdrgen/gpu_headers.py new file mode 100644 index 00000000000000..cc13096cd47c11 --- /dev/null +++ b/libc/newhdrgen/gpu_headers.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +# +# ===- GPU HeaderFile Class for --export-decls version --------*- python -*--==# +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ==-------------------------------------------------------------------------==# + + +class GpuHeaderFile: + def __init__(self, name): + self.name = name + self.macros = [] + self.types = [] + self.enumerations = [] + self.objects = [] + self.functions = [] + self.includes = [] + + def add_macro(self, macro): + self.macros.append(macro) + + def add_type(self, type_): + self.types.append(type_) + + def add_enumeration(self, enumeration): + self.enumerations.append(enumeration) + + def add_object(self, object): + self.objects.append(object) + + def add_function(self, function): + self.functions.append(function) + + def __str__(self): + content = [] + + content.append( + f"//===-- C standard declarations for {self.name} ------------------------------===//" + ) + content.append("//") + content.append( + "// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions." + ) + content.append("// See https://llvm.org/LICENSE.txt for license information.") + content.append("// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception") + content.append("//") + content.append( + "//===----------------------------------------------------------------------===//\n" + ) + + header_guard = f"__LLVM_LIBC_DECLARATIONS_{self.name.upper()[:-2]}_H" + content.append(f"#ifndef {header_guard}") + content.append(f"#define {header_guard}\n") + + content.append("#ifndef __LIBC_ATTRS") + content.append("#define __LIBC_ATTRS") + content.append("#endif\n") + + content.append("#ifdef __cplusplus") + content.append('extern "C" {') + content.append("#endif\n") + + for function in self.functions: + content.append(f"{function} __LIBC_ATTRS;\n") + + for object in self.objects: + content.append(f"{object} __LIBC_ATTRS;\n") + + content.append("#ifdef __cplusplus") + content.append("}") + content.append("#endif\n") + + content.append(f"#endif") + + return "\n".join(content) diff --git a/libc/newhdrgen/header.py b/libc/newhdrgen/header.py index ac45bae7c933e2..69de81eebb7198 100644 --- a/libc/newhdrgen/header.py +++ b/libc/newhdrgen/header.py @@ -60,16 +60,16 @@ def __str__(self): current_guard = None for function in self.functions: if function.guard == None: - content.append(str(function)) + content.append(str(function) + "__NOEXCEPT") content.append("") else: if current_guard == None: current_guard = function.guard content.append(f"#ifdef {current_guard}") - content.append(str(function)) + content.append(str(function) + "__NOEXCEPT") content.append("") elif current_guard == function.guard: - content.append(str(function)) + content.append(str(function) + "__NOEXCEPT") content.append("") else: content.pop() @@ -77,12 +77,12 @@ def __str__(self): content.append("") current_guard = function.guard content.append(f"#ifdef {current_guard}") - content.append(str(function)) + content.append(str(function) + "__NOEXCEPT") content.append("") if current_guard != None: content.pop() content.append(f"#endif // {current_guard}") - content.append("") + content.append("") for object in self.objects: content.append(str(object)) diff --git a/libc/newhdrgen/yaml/assert.yaml b/libc/newhdrgen/yaml/assert.yaml new file mode 100644 index 00000000000000..9ad0f0628274e6 --- /dev/null +++ b/libc/newhdrgen/yaml/assert.yaml @@ -0,0 +1,16 @@ +header: assert.h +macros: [] +types: [] +enums: [] +objects: [] +functions: + - name: __assert_fail + standards: + - llvm_libc_ext + return_type: _Noreturn void + arguments: + - type: const char * + - type: const char * + - type: unsigned + - type: const char * + guard: __cplusplus diff --git a/libc/newhdrgen/yaml/features.yaml b/libc/newhdrgen/yaml/features.yaml new file mode 100644 index 00000000000000..86bc0acfe89ed2 --- /dev/null +++ b/libc/newhdrgen/yaml/features.yaml @@ -0,0 +1,8 @@ +header: features.h +standards: + - stdc +macros: [] +types: [] +enums: [] +objects: [] +functions: [] diff --git a/libc/newhdrgen/yaml/pthread.yaml b/libc/newhdrgen/yaml/pthread.yaml index 14a562082d5deb..f22767eb1b752e 100644 --- a/libc/newhdrgen/yaml/pthread.yaml +++ b/libc/newhdrgen/yaml/pthread.yaml @@ -13,33 +13,8 @@ types: - type_name: __pthread_start_t - type_name: __pthread_once_func_t - type_name: __atfork_callback_t -enums: - - name: PTHREAD_CREATE_JOINABLE - value: 0x0 - - name: PTHREAD_CREATE_DETACHED - value: 0x1 - - name: PTHREAD_MUTEX_NORMAL - value: 0x0 - - name: PTHREAD_MUTEX_ERRORCHECK - value: 0x1 - - name: PTHREAD_MUTEX_RECURSIVE - value: 0x2 - - name: PTHREAD_MUTEX_DEFAULT - value: 0x0 - - name: PTHREAD_PROCESS_PRIVATE - value: 0x0 - - name: PTHREAD_PROCESS_SHARED - value: 0x1 - - name: PTHREAD_MUTEX_STALLED - value: 0x0 - - name: PTHREAD_MUTEX_ROBUST - value: 0x1 - - name: PTHREAD_RWLOCK_PREFER_READER_NP - value: 0 - - name: PTHREAD_RWLOCK_PREFER_WRITER_NP - value: 1 - - name: PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP - value: 2 + - type_name: pthread_rwlock_t +enums: [] functions: - name: pthread_atfork standards: @@ -184,7 +159,7 @@ functions: - name: pthread_exit standards: - POSIX - return_type: __Noreturn void + return_type: _Noreturn void arguments: - type: void * - name: pthread_getname_np diff --git a/libc/newhdrgen/yaml/sys/sys_random.yaml b/libc/newhdrgen/yaml/sys/sys_random.yaml index 233fb2c7988cb9..6d84056d7dd71d 100644 --- a/libc/newhdrgen/yaml/sys/sys_random.yaml +++ b/libc/newhdrgen/yaml/sys/sys_random.yaml @@ -4,7 +4,7 @@ types: - type_name: ssize_t - type_name: size_t enums: [] -objects: +objects: [] functions: - name: getrandom standards: diff --git a/libc/newhdrgen/yaml/time.yaml b/libc/newhdrgen/yaml/time.yaml index e7f8de65eeb754..220d4328dbbdbb 100644 --- a/libc/newhdrgen/yaml/time.yaml +++ b/libc/newhdrgen/yaml/time.yaml @@ -15,7 +15,7 @@ functions: - stdc return_type: char * arguments: - - type: struct tm * + - type: const struct tm * - name: asctime_r standard: - stdc diff --git a/libc/newhdrgen/yaml_to_classes.py b/libc/newhdrgen/yaml_to_classes.py index 6bccda8e036409..205bb35fe691a6 100644 --- a/libc/newhdrgen/yaml_to_classes.py +++ b/libc/newhdrgen/yaml_to_classes.py @@ -8,12 +8,11 @@ # # ==-------------------------------------------------------------------------==# - import yaml import argparse - from pathlib import Path from header import HeaderFile +from gpu_headers import GpuHeaderFile as GpuHeader from class_implementation.classes.macro import Macro from class_implementation.classes.type import Type from class_implementation.classes.function import Function @@ -22,18 +21,20 @@ from class_implementation.classes.object import Object -def yaml_to_classes(yaml_data): +def yaml_to_classes(yaml_data, header_class, entry_points=None): """ Convert YAML data to header classes. Args: yaml_data: The YAML data containing header specifications. + header_class: The class to use for creating the header. + entry_points: A list of specific function names to include in the header. Returns: HeaderFile: An instance of HeaderFile populated with the data. """ header_name = yaml_data.get("header") - header = HeaderFile(header_name) + header = header_class(header_name) for macro_data in yaml_data.get("macros", []): header.add_macro(Macro(macro_data["macro_name"], macro_data["macro_value"])) @@ -49,12 +50,15 @@ def yaml_to_classes(yaml_data): ) functions = yaml_data.get("functions", []) + if entry_points: + entry_points_set = set(entry_points) + functions = [f for f in functions if f["name"] in entry_points_set] sorted_functions = sorted(functions, key=lambda x: x["name"]) guards = [] guarded_function_dict = {} for function_data in sorted_functions: guard = function_data.get("guard", None) - if guard == None: + if guard is None: arguments = [arg["type"] for arg in function_data["arguments"]] attributes = function_data.get("attributes", None) standards = function_data.get("standards", None) @@ -105,19 +109,21 @@ def yaml_to_classes(yaml_data): return header -def load_yaml_file(yaml_file): +def load_yaml_file(yaml_file, header_class, entry_points): """ Load YAML file and convert it to header classes. Args: - yaml_file: The path to the YAML file. + yaml_file: Path to the YAML file. + header_class: The class to use for creating the header (HeaderFile or GpuHeader). + entry_points: A list of specific function names to include in the header. Returns: - HeaderFile: An instance of HeaderFile populated with the data from the YAML file. + HeaderFile: An instance of HeaderFile populated with the data. """ with open(yaml_file, "r") as f: yaml_data = yaml.safe_load(f) - return yaml_to_classes(yaml_data) + return yaml_to_classes(yaml_data, header_class, entry_points) def fill_public_api(header_str, h_def_content): @@ -207,7 +213,14 @@ def increase_indent(self, flow=False, indentless=False): print(f"Added function {new_function.name} to {yaml_file}") -def main(yaml_file, h_def_file, output_dir, add_function=None): +def main( + yaml_file, + output_dir=None, + h_def_file=None, + add_function=None, + entry_points=None, + export_decls=False, +): """ Main function to generate header files from YAML and .h.def templates. @@ -216,41 +229,50 @@ def main(yaml_file, h_def_file, output_dir, add_function=None): h_def_file: Path to the .h.def template file. output_dir: Directory to output the generated header file. add_function: Details of the function to be added to the YAML file (if any). + entry_points: A list of specific function names to include in the header. + export_decls: Flag to use GpuHeader for exporting declarations. """ - if add_function: add_function_to_yaml(yaml_file, add_function) - header = load_yaml_file(yaml_file) - - with open(h_def_file, "r") as f: - h_def_content = f.read() + header_class = GpuHeader if export_decls else HeaderFile + header = load_yaml_file(yaml_file, header_class, entry_points) header_str = str(header) - final_header_content = fill_public_api(header_str, h_def_content) - output_file_name = Path(h_def_file).stem - output_file_path = Path(output_dir) / output_file_name - - with open(output_file_path, "w") as f: - f.write(final_header_content) + if output_dir: + output_file_path = Path(output_dir) + if output_file_path.is_dir(): + output_file_path /= f"{Path(yaml_file).stem}.h" + else: + output_file_path = Path(f"{Path(yaml_file).stem}.h") + + if not export_decls and h_def_file: + with open(h_def_file, "r") as f: + h_def_content = f.read() + final_header_content = fill_public_api(header_str, h_def_content) + with open(output_file_path, "w") as f: + f.write(final_header_content) + else: + with open(output_file_path, "w") as f: + f.write(header_str) print(f"Generated header file: {output_file_path}") if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Generate header files from YAML and .h.def templates" - ) + parser = argparse.ArgumentParser(description="Generate header files from YAML") parser.add_argument( "yaml_file", help="Path to the YAML file containing header specification" ) - parser.add_argument("h_def_file", help="Path to the .h.def template file") parser.add_argument( "--output_dir", - default=".", help="Directory to output the generated header file", ) + parser.add_argument( + "--h_def_file", + help="Path to the .h.def template file (required if not using --export_decls)", + ) parser.add_argument( "--add_function", nargs=6, @@ -264,6 +286,21 @@ def main(yaml_file, h_def_file, output_dir, add_function=None): ), help="Add a function to the YAML file", ) + parser.add_argument( + "--e", action="append", help="Entry point to include", dest="entry_points" + ) + parser.add_argument( + "--export-decls", + action="store_true", + help="Flag to use GpuHeader for exporting declarations", + ) args = parser.parse_args() - main(args.yaml_file, args.h_def_file, args.output_dir, args.add_function) + main( + args.yaml_file, + args.output_dir, + args.h_def_file, + args.add_function, + args.entry_points, + args.export_decls, + ) diff --git a/libc/src/__support/OSUtil/baremetal/io.cpp b/libc/src/__support/OSUtil/baremetal/io.cpp index de278eb0092f81..2a9ef6bfa65799 100644 --- a/libc/src/__support/OSUtil/baremetal/io.cpp +++ b/libc/src/__support/OSUtil/baremetal/io.cpp @@ -13,20 +13,58 @@ namespace LIBC_NAMESPACE_DECL { -// This is intended to be provided by the vendor. +// These are intended to be provided by the vendor. +// +// The signature of these types and functions intentionally match `fopencookie` +// which allows the following: +// +// ``` +// struct __llvm_libc_stdio_cookie { ... }; +// ... +// struct __llvm_libc_stdio_cookie __llvm_libc_stdin_cookie; +// cookie_io_functions_t stdin_func = { .read = __llvm_libc_stdio_read }; +// FILE *stdin = fopencookie(&__llvm_libc_stdin_cookie, "r", stdin_func); +// ... +// struct __llvm_libc_stdio_cookie __llvm_libc_stdout_cookie; +// cookie_io_functions_t stdout_func = { .write = __llvm_libc_stdio_write }; +// FILE *stdout = fopencookie(&__llvm_libc_stdout_cookie, "w", stdout_func); +// ... +// struct __llvm_libc_stdio_cookie __llvm_libc_stderr_cookie; +// cookie_io_functions_t stderr_func = { .write = __llvm_libc_stdio_write }; +// FILE *stderr = fopencookie(&__llvm_libc_stderr_cookie, "w", stderr_func); +// ``` +// +// At the same time, implementation of functions like `printf` and `scanf` can +// use `__llvm_libc_stdio_read` and `__llvm_libc_stdio_write` directly to avoid +// the extra indirection. +// +// All three symbols `__llvm_libc_stdin_cookie`, `__llvm_libc_stdout_cookie`, +// and `__llvm_libc_stderr_cookie` must be provided, even if they don't point +// at anything. -extern struct __llvm_libc_stdin __llvm_libc_stdin; -extern "C" ssize_t __llvm_libc_stdin_read(void *cookie, char *buf, size_t size); +struct __llvm_libc_stdio_cookie; -extern "C" void __llvm_libc_log_write(const char *msg, size_t len); +extern "C" struct __llvm_libc_stdio_cookie __llvm_libc_stdin_cookie; +extern "C" struct __llvm_libc_stdio_cookie __llvm_libc_stdout_cookie; +extern "C" struct __llvm_libc_stdio_cookie __llvm_libc_stderr_cookie; + +extern "C" ssize_t __llvm_libc_stdio_read(void *cookie, char *buf, size_t size); +extern "C" ssize_t __llvm_libc_stdio_write(void *cookie, const char *buf, + size_t size); ssize_t read_from_stdin(char *buf, size_t size) { - return __llvm_libc_stdin_read(reinterpret_cast(&__llvm_libc_stdin), + return __llvm_libc_stdio_read(static_cast(&__llvm_libc_stdin_cookie), buf, size); } +void write_to_stdout(cpp::string_view msg) { + __llvm_libc_stdio_write(static_cast(&__llvm_libc_stdout_cookie), + msg.data(), msg.size()); +} + void write_to_stderr(cpp::string_view msg) { - __llvm_libc_log_write(msg.data(), msg.size()); + __llvm_libc_stdio_write(static_cast(&__llvm_libc_stderr_cookie), + msg.data(), msg.size()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/OSUtil/baremetal/io.h b/libc/src/__support/OSUtil/baremetal/io.h index 92bf3db6b3fd14..aed34ec7e62e3f 100644 --- a/libc/src/__support/OSUtil/baremetal/io.h +++ b/libc/src/__support/OSUtil/baremetal/io.h @@ -18,6 +18,7 @@ namespace LIBC_NAMESPACE_DECL { ssize_t read_from_stdin(char *buf, size_t size); void write_to_stderr(cpp::string_view msg); +void write_to_stdout(cpp::string_view msg); } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/threads/CMakeLists.txt b/libc/src/__support/threads/CMakeLists.txt index 9ea0b59befe7ac..d2e46b8e2574e9 100644 --- a/libc/src/__support/threads/CMakeLists.txt +++ b/libc/src/__support/threads/CMakeLists.txt @@ -10,6 +10,15 @@ add_header_library( sleep.h ) +add_header_library( + spin_lock + HDRS + spin_lock.h + DEPENDS + .sleep + libc.src.__support.CPP.atomic +) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) add_subdirectory(${LIBC_TARGET_OS}) endif() diff --git a/libc/src/__support/threads/spin_lock.h b/libc/src/__support/threads/spin_lock.h new file mode 100644 index 00000000000000..8a365505684644 --- /dev/null +++ b/libc/src/__support/threads/spin_lock.h @@ -0,0 +1,81 @@ +//===-- TTAS Spin Lock ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_THREADS_SPIN_LOCK_H +#define LLVM_LIBC_SRC___SUPPORT_THREADS_SPIN_LOCK_H + +#include "src/__support/CPP/atomic.h" +#include "src/__support/macros/attributes.h" +#include "src/__support/macros/properties/architectures.h" +#include "src/__support/threads/sleep.h" + +namespace LIBC_NAMESPACE_DECL { + +namespace spinlock { +template +using AtomicOp = Return (cpp::Atomic::*)(LockWord, cpp::MemoryOrder, + cpp::MemoryScope); +} + +template Acquire, + spinlock::AtomicOp Release> +class SpinLockAdaptor { + cpp::Atomic flag; + +public: + LIBC_INLINE constexpr SpinLockAdaptor() : flag{false} {} + LIBC_INLINE bool try_lock() { + return !flag.*Acquire(static_cast(1), cpp::MemoryOrder::ACQUIRE); + } + LIBC_INLINE void lock() { + // clang-format off + // For normal TTAS, this compiles to the following on armv9a and x86_64: + // mov w8, #1 | .LBB0_1: + // .LBB0_1: | mov al, 1 + // swpab w8, w9, [x0] | xchg byte ptr [rdi], al + // tbnz w9, #0, .LBB0_3 | test al, 1 + // b .LBB0_4 | jne .LBB0_3 + // .LBB0_2: | jmp .LBB0_4 + // isb | .LBB0_2: + // .LBB0_3: | pause + // ldrb w9, [x0] | .LBB0_3: + // tbnz w9, #0, .LBB0_2 | movzx eax, byte ptr [rdi] + // b .LBB0_1 | test al, 1 + // .LBB0_4: | jne .LBB0_2 + // ret | jmp .LBB0_1 + // | .LBB0_4: + // | ret + // clang-format on + // Notice that inside the busy loop .LBB0_2 and .LBB0_3, only instructions + // with load semantics are used. swpab/xchg is only issued in outer loop + // .LBB0_1. This is useful to avoid extra write traffic. The cache + // coherence guarantees "write propagation", so even if the inner loop only + // reads with relaxed ordering, the thread will evetually see the write. + while (!try_lock()) + while (flag.load(cpp::MemoryOrder::RELAXED)) + sleep_briefly(); + } + LIBC_INLINE void unlock() { + flag.*Release(static_cast(0), cpp::MemoryOrder::RELEASE); + } +}; + +// It is reported that atomic operations with higher-order semantics +// lead to better performance on GPUs. +#ifdef LIBC_TARGET_ARCH_IS_GPU +using SpinLock = + SpinLockAdaptor::fetch_or, + &cpp::Atomic::fetch_and>; +#else +using SpinLock = SpinLockAdaptor::exchange, + &cpp::Atomic::store>; +#endif + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_THREADS_SPIN_LOCK_H diff --git a/libc/src/math/aarch64/CMakeLists.txt b/libc/src/math/aarch64/CMakeLists.txt deleted file mode 100644 index bbe927a1c7c889..00000000000000 --- a/libc/src/math/aarch64/CMakeLists.txt +++ /dev/null @@ -1,79 +0,0 @@ -add_entrypoint_object( - ceil - SRCS - ceil.cpp - HDRS - ../ceil.h - COMPILE_OPTIONS - -O2 -) - -add_entrypoint_object( - ceilf - SRCS - ceilf.cpp - HDRS - ../ceilf.h - COMPILE_OPTIONS - -O2 -) - -add_entrypoint_object( - floor - SRCS - floor.cpp - HDRS - ../floor.h - COMPILE_OPTIONS - -O2 -) - -add_entrypoint_object( - floorf - SRCS - floorf.cpp - HDRS - ../floorf.h - COMPILE_OPTIONS - -O2 -) - -add_entrypoint_object( - trunc - SRCS - trunc.cpp - HDRS - ../trunc.h - COMPILE_OPTIONS - -O2 -) - -add_entrypoint_object( - truncf - SRCS - truncf.cpp - HDRS - ../truncf.h - COMPILE_OPTIONS - -O2 -) - -add_entrypoint_object( - round - SRCS - round.cpp - HDRS - ../round.h - COMPILE_OPTIONS - -O2 -) - -add_entrypoint_object( - roundf - SRCS - roundf.cpp - HDRS - ../roundf.h - COMPILE_OPTIONS - -O2 -) diff --git a/libc/src/math/aarch64/ceil.cpp b/libc/src/math/aarch64/ceil.cpp deleted file mode 100644 index 5bfd053d603dec..00000000000000 --- a/libc/src/math/aarch64/ceil.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of the ceil function for aarch64 -------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/math/ceil.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(double, ceil, (double x)) { - double y; - __asm__ __volatile__("frintp %d0, %d1\n\t" : "=w"(y) : "w"(x)); - return y; -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/aarch64/ceilf.cpp b/libc/src/math/aarch64/ceilf.cpp deleted file mode 100644 index 2352245bc8303c..00000000000000 --- a/libc/src/math/aarch64/ceilf.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of the ceilf function for aarch64 ------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/math/ceilf.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(float, ceilf, (float x)) { - float y; - __asm__ __volatile__("frintp %s0, %s1\n\t" : "=w"(y) : "w"(x)); - return y; -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/aarch64/floor.cpp b/libc/src/math/aarch64/floor.cpp deleted file mode 100644 index f9da52bdd45a85..00000000000000 --- a/libc/src/math/aarch64/floor.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of the floor function for aarch64 ------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/math/floor.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(double, floor, (double x)) { - double y; - __asm__ __volatile__("frintm %d0, %d1\n\t" : "=w"(y) : "w"(x)); - return y; -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/aarch64/floorf.cpp b/libc/src/math/aarch64/floorf.cpp deleted file mode 100644 index 980b3c52a00b53..00000000000000 --- a/libc/src/math/aarch64/floorf.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of the floorf function for aarch64 -----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/math/floorf.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(float, floorf, (float x)) { - float y; - __asm__ __volatile__("frintm %s0, %s1\n\t" : "=w"(y) : "w"(x)); - return y; -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/aarch64/round.cpp b/libc/src/math/aarch64/round.cpp deleted file mode 100644 index c85445aa19ab50..00000000000000 --- a/libc/src/math/aarch64/round.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of the round function for aarch64 ------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/math/round.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(double, round, (double x)) { - double y; - __asm__ __volatile__("frinta %d0, %d1\n\t" : "=w"(y) : "w"(x)); - return y; -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/aarch64/roundf.cpp b/libc/src/math/aarch64/roundf.cpp deleted file mode 100644 index 0c7f7640e8278c..00000000000000 --- a/libc/src/math/aarch64/roundf.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of the roundf function for aarch64 -----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/math/roundf.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(float, roundf, (float x)) { - float y; - __asm__ __volatile__("frinta %s0, %s1\n\t" : "=w"(y) : "w"(x)); - return y; -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/aarch64/trunc.cpp b/libc/src/math/aarch64/trunc.cpp deleted file mode 100644 index 1ef26f44fd234e..00000000000000 --- a/libc/src/math/aarch64/trunc.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of the trunc function for aarch64 ------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/math/trunc.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(double, trunc, (double x)) { - double y; - __asm__ __volatile__("frintz %d0, %d1\n" : "=w"(y) : "w"(x)); - return y; -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/aarch64/truncf.cpp b/libc/src/math/aarch64/truncf.cpp deleted file mode 100644 index 0c64ef60e91c4d..00000000000000 --- a/libc/src/math/aarch64/truncf.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of the truncf function for aarch64 -----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/math/truncf.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(float, truncf, (float x)) { - float y; - __asm__ __volatile__("frintz %s0, %s1\n\t" : "=w"(y) : "w"(x)); - return y; -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 5e920307d39de4..c2f58fb1a4f71a 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -70,6 +70,8 @@ add_entrypoint_object( -O3 DEPENDS libc.src.__support.FPUtil.nearest_integer_operations + FLAGS + ROUND_OPT ) add_entrypoint_object( @@ -82,6 +84,8 @@ add_entrypoint_object( -O3 DEPENDS libc.src.__support.FPUtil.nearest_integer_operations + FLAGS + ROUND_OPT ) add_entrypoint_object( @@ -107,6 +111,9 @@ add_entrypoint_object( DEPENDS libc.src.__support.macros.properties.types libc.src.__support.FPUtil.nearest_integer_operations + libc.src.__support.macros.properties.architectures + FLAGS + ROUND_OPT ) add_entrypoint_object( @@ -455,6 +462,8 @@ add_entrypoint_object( -O3 DEPENDS libc.src.__support.FPUtil.nearest_integer_operations + FLAGS + ROUND_OPT ) add_entrypoint_object( @@ -467,6 +476,8 @@ add_entrypoint_object( -O3 DEPENDS libc.src.__support.FPUtil.nearest_integer_operations + FLAGS + ROUND_OPT ) add_entrypoint_object( @@ -492,6 +503,9 @@ add_entrypoint_object( DEPENDS libc.src.__support.macros.properties.types libc.src.__support.FPUtil.nearest_integer_operations + libc.src.__support.macros.properties.architectures + FLAGS + ROUND_OPT ) add_entrypoint_object( @@ -517,6 +531,8 @@ add_entrypoint_object( -O3 DEPENDS libc.src.__support.FPUtil.nearest_integer_operations + FLAGS + ROUND_OPT ) add_entrypoint_object( @@ -529,6 +545,8 @@ add_entrypoint_object( -O3 DEPENDS libc.src.__support.FPUtil.nearest_integer_operations + FLAGS + ROUND_OPT ) add_entrypoint_object( @@ -554,6 +572,9 @@ add_entrypoint_object( DEPENDS libc.src.__support.macros.properties.types libc.src.__support.FPUtil.nearest_integer_operations + libc.src.__support.macros.properties.architectures + FLAGS + ROUND_OPT ) add_entrypoint_object( @@ -579,6 +600,8 @@ add_entrypoint_object( -O3 DEPENDS libc.src.__support.FPUtil.nearest_integer_operations + FLAGS + ROUND_OPT ) add_entrypoint_object( @@ -591,6 +614,8 @@ add_entrypoint_object( -O3 DEPENDS libc.src.__support.FPUtil.nearest_integer_operations + FLAGS + ROUND_OPT ) add_entrypoint_object( @@ -616,6 +641,9 @@ add_entrypoint_object( DEPENDS libc.src.__support.macros.properties.types libc.src.__support.FPUtil.nearest_integer_operations + libc.src.__support.macros.properties.architectures + FLAGS + ROUND_OPT ) add_entrypoint_object( @@ -641,6 +669,8 @@ add_entrypoint_object( -O3 DEPENDS libc.src.__support.FPUtil.nearest_integer_operations + FLAGS + ROUND_OPT ) add_entrypoint_object( @@ -653,6 +683,8 @@ add_entrypoint_object( -O3 DEPENDS libc.src.__support.FPUtil.nearest_integer_operations + FLAGS + ROUND_OPT ) add_entrypoint_object( @@ -678,6 +710,9 @@ add_entrypoint_object( DEPENDS libc.src.__support.macros.properties.types libc.src.__support.FPUtil.nearest_integer_operations + libc.src.__support.macros.properties.architectures + FLAGS + ROUND_OPT ) add_entrypoint_object( @@ -827,6 +862,8 @@ add_entrypoint_object( -O3 DEPENDS libc.src.__support.FPUtil.nearest_integer_operations + FLAGS + ROUND_OPT ) add_entrypoint_object( @@ -839,6 +876,8 @@ add_entrypoint_object( -O3 DEPENDS libc.src.__support.FPUtil.nearest_integer_operations + FLAGS + ROUND_OPT ) add_entrypoint_object( @@ -864,6 +903,9 @@ add_entrypoint_object( DEPENDS libc.src.__support.macros.properties.types libc.src.__support.FPUtil.nearest_integer_operations + libc.src.__support.macros.properties.architectures + FLAGS + ROUND_OPT ) add_entrypoint_object( diff --git a/libc/src/math/generic/ceil.cpp b/libc/src/math/generic/ceil.cpp index a5ac1348834d8d..72c6e990fcc711 100644 --- a/libc/src/math/generic/ceil.cpp +++ b/libc/src/math/generic/ceil.cpp @@ -13,6 +13,12 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(double, ceil, (double x)) { return fputil::ceil(x); } +LLVM_LIBC_FUNCTION(double, ceil, (double x)) { +#ifdef __LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC + return __builtin_ceil(x); +#else + return fputil::ceil(x); +#endif +} } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/ceilf.cpp b/libc/src/math/generic/ceilf.cpp index 0fd54361648dcd..dfd0dc62bc51b2 100644 --- a/libc/src/math/generic/ceilf.cpp +++ b/libc/src/math/generic/ceilf.cpp @@ -13,6 +13,12 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(float, ceilf, (float x)) { return fputil::ceil(x); } +LLVM_LIBC_FUNCTION(float, ceilf, (float x)) { +#ifdef __LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC + return __builtin_ceilf(x); +#else + return fputil::ceil(x); +#endif +} } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/ceilf16.cpp b/libc/src/math/generic/ceilf16.cpp index 1d17daf283d07e..708bc4cfd4860c 100644 --- a/libc/src/math/generic/ceilf16.cpp +++ b/libc/src/math/generic/ceilf16.cpp @@ -10,9 +10,17 @@ #include "src/__support/FPUtil/NearestIntegerOperations.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/architectures.h" namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(float16, ceilf16, (float16 x)) { return fputil::ceil(x); } +LLVM_LIBC_FUNCTION(float16, ceilf16, (float16 x)) { +#if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) && \ + defined(LIBC_TARGET_ARCH_IS_AARCH64) + return static_cast(__builtin_ceilf(x)); +#else + return fputil::ceil(x); +#endif +} } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/floor.cpp b/libc/src/math/generic/floor.cpp index 417d6aca3ffe54..86aed6c61a7cca 100644 --- a/libc/src/math/generic/floor.cpp +++ b/libc/src/math/generic/floor.cpp @@ -13,6 +13,12 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(double, floor, (double x)) { return fputil::floor(x); } +LLVM_LIBC_FUNCTION(double, floor, (double x)) { +#ifdef __LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC + return __builtin_floor(x); +#else + return fputil::floor(x); +#endif +} } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/floorf.cpp b/libc/src/math/generic/floorf.cpp index ca059e6d02d53a..22739eff68ec2d 100644 --- a/libc/src/math/generic/floorf.cpp +++ b/libc/src/math/generic/floorf.cpp @@ -13,6 +13,12 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(float, floorf, (float x)) { return fputil::floor(x); } +LLVM_LIBC_FUNCTION(float, floorf, (float x)) { +#ifdef __LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC + return __builtin_floorf(x); +#else + return fputil::floor(x); +#endif +} } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/floorf16.cpp b/libc/src/math/generic/floorf16.cpp index 46068c2a548a93..84e4b0730ac689 100644 --- a/libc/src/math/generic/floorf16.cpp +++ b/libc/src/math/generic/floorf16.cpp @@ -10,9 +10,17 @@ #include "src/__support/FPUtil/NearestIntegerOperations.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/architectures.h" namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(float16, floorf16, (float16 x)) { return fputil::floor(x); } +LLVM_LIBC_FUNCTION(float16, floorf16, (float16 x)) { +#if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) && \ + defined(LIBC_TARGET_ARCH_IS_AARCH64) + return static_cast(__builtin_floorf(x)); +#else + return fputil::floor(x); +#endif +} } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/rint.cpp b/libc/src/math/generic/rint.cpp index f7837df1334840..5defa60ddac1c8 100644 --- a/libc/src/math/generic/rint.cpp +++ b/libc/src/math/generic/rint.cpp @@ -14,7 +14,11 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(double, rint, (double x)) { +#ifdef __LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC + return __builtin_rint(x); +#else return fputil::round_using_current_rounding_mode(x); +#endif } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/rintf.cpp b/libc/src/math/generic/rintf.cpp index 29a2845cc85d4f..2fe7788241168e 100644 --- a/libc/src/math/generic/rintf.cpp +++ b/libc/src/math/generic/rintf.cpp @@ -14,7 +14,11 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float, rintf, (float x)) { +#ifdef __LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC + return __builtin_rintf(x); +#else return fputil::round_using_current_rounding_mode(x); +#endif } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/rintf16.cpp b/libc/src/math/generic/rintf16.cpp index 69b89de31bf403..0e8c091efcf9b9 100644 --- a/libc/src/math/generic/rintf16.cpp +++ b/libc/src/math/generic/rintf16.cpp @@ -10,11 +10,17 @@ #include "src/__support/FPUtil/NearestIntegerOperations.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/architectures.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float16, rintf16, (float16 x)) { +#if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) && \ + defined(LIBC_TARGET_ARCH_IS_AARCH64) + return static_cast(__builtin_rintf(x)); +#else return fputil::round_using_current_rounding_mode(x); +#endif } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/round.cpp b/libc/src/math/generic/round.cpp index d8b171a2ef092b..6ed5be50659629 100644 --- a/libc/src/math/generic/round.cpp +++ b/libc/src/math/generic/round.cpp @@ -13,6 +13,12 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(double, round, (double x)) { return fputil::round(x); } +LLVM_LIBC_FUNCTION(double, round, (double x)) { +#ifdef __LIBC_USE_BUILTIN_ROUND + return __builtin_round(x); +#else + return fputil::round(x); +#endif +} } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/roundeven.cpp b/libc/src/math/generic/roundeven.cpp index e6f599991bc0f7..42f19f38b53856 100644 --- a/libc/src/math/generic/roundeven.cpp +++ b/libc/src/math/generic/roundeven.cpp @@ -14,7 +14,11 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(double, roundeven, (double x)) { +#ifdef __LIBC_USE_BUILTIN_ROUNDEVEN + return __builtin_roundeven(x); +#else return fputil::round_using_specific_rounding_mode(x, FP_INT_TONEAREST); +#endif } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/roundevenf.cpp b/libc/src/math/generic/roundevenf.cpp index 0b63a093edf6c9..98bdc6545d94e6 100644 --- a/libc/src/math/generic/roundevenf.cpp +++ b/libc/src/math/generic/roundevenf.cpp @@ -14,7 +14,11 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float, roundevenf, (float x)) { +#ifdef __LIBC_USE_BUILTIN_ROUNDEVEN + return __builtin_roundevenf(x); +#else return fputil::round_using_specific_rounding_mode(x, FP_INT_TONEAREST); +#endif } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/roundevenf16.cpp b/libc/src/math/generic/roundevenf16.cpp index 8a27d81b895ae9..b45670bd24ff1c 100644 --- a/libc/src/math/generic/roundevenf16.cpp +++ b/libc/src/math/generic/roundevenf16.cpp @@ -10,11 +10,17 @@ #include "src/__support/FPUtil/NearestIntegerOperations.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/architectures.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float16, roundevenf16, (float16 x)) { +#if defined(__LIBC_USE_BUILTIN_ROUNDEVEN) && \ + defined(LIBC_TARGET_ARCH_IS_AARCH64) + return static_cast(__builtin_roundevenf(x)); +#else return fputil::round_using_specific_rounding_mode(x, FP_INT_TONEAREST); +#endif } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/roundf.cpp b/libc/src/math/generic/roundf.cpp index 68be4b1a519876..d25f7128cb9cea 100644 --- a/libc/src/math/generic/roundf.cpp +++ b/libc/src/math/generic/roundf.cpp @@ -13,6 +13,12 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(float, roundf, (float x)) { return fputil::round(x); } +LLVM_LIBC_FUNCTION(float, roundf, (float x)) { +#ifdef __LIBC_USE_BUILTIN_ROUND + return __builtin_roundf(x); +#else + return fputil::round(x); +#endif +} } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/roundf16.cpp b/libc/src/math/generic/roundf16.cpp index 06f9a79cc21f32..cb668c0e763886 100644 --- a/libc/src/math/generic/roundf16.cpp +++ b/libc/src/math/generic/roundf16.cpp @@ -10,9 +10,16 @@ #include "src/__support/FPUtil/NearestIntegerOperations.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/architectures.h" namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(float16, roundf16, (float16 x)) { return fputil::round(x); } +LLVM_LIBC_FUNCTION(float16, roundf16, (float16 x)) { +#if defined(__LIBC_USE_BUILTIN_ROUND) && defined(LIBC_TARGET_ARCH_IS_AARCH64) + return static_cast(__builtin_roundf(x)); +#else + return fputil::round(x); +#endif +} } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/trunc.cpp b/libc/src/math/generic/trunc.cpp index 0bf2ac5962e7b5..603750f11c9f6f 100644 --- a/libc/src/math/generic/trunc.cpp +++ b/libc/src/math/generic/trunc.cpp @@ -13,6 +13,12 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(double, trunc, (double x)) { return fputil::trunc(x); } +LLVM_LIBC_FUNCTION(double, trunc, (double x)) { +#ifdef __LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC + return __builtin_trunc(x); +#else + return fputil::trunc(x); +#endif +} } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/truncf.cpp b/libc/src/math/generic/truncf.cpp index 371cb9f8229a6c..d7b0ffd96da9c5 100644 --- a/libc/src/math/generic/truncf.cpp +++ b/libc/src/math/generic/truncf.cpp @@ -13,6 +13,12 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(float, truncf, (float x)) { return fputil::trunc(x); } +LLVM_LIBC_FUNCTION(float, truncf, (float x)) { +#ifdef __LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC + return __builtin_truncf(x); +#else + return fputil::trunc(x); +#endif +} } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/truncf16.cpp b/libc/src/math/generic/truncf16.cpp index ea7695dc95f0bb..b931053e534385 100644 --- a/libc/src/math/generic/truncf16.cpp +++ b/libc/src/math/generic/truncf16.cpp @@ -10,9 +10,17 @@ #include "src/__support/FPUtil/NearestIntegerOperations.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/architectures.h" namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(float16, truncf16, (float16 x)) { return fputil::trunc(x); } +LLVM_LIBC_FUNCTION(float16, truncf16, (float16 x)) { +#if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) && \ + defined(LIBC_TARGET_ARCH_IS_AARCH64) + return static_cast(__builtin_truncf(x)); +#else + return fputil::trunc(x); +#endif +} } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index 7b7e55db391fae..513f6ad723d564 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -258,9 +258,13 @@ add_entrypoint_object( add_header_library( qsort_util HDRS + qsort_data.h qsort_util.h + heap_sort.h + quick_sort.h DEPENDS libc.include.stdlib + libc.src.__support.CPP.cstddef ) add_entrypoint_object( diff --git a/libc/src/stdlib/heap_sort.h b/libc/src/stdlib/heap_sort.h new file mode 100644 index 00000000000000..ccb9ec5f82149e --- /dev/null +++ b/libc/src/stdlib/heap_sort.h @@ -0,0 +1,61 @@ +//===-- Implementation of heap sort -----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_HEAP_SORT_H +#define LLVM_LIBC_SRC_STDLIB_HEAP_SORT_H + +#include "src/__support/CPP/cstddef.h" +#include "src/stdlib/qsort_data.h" + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +// A simple in-place heapsort implementation. +// Follow the implementation in https://en.wikipedia.org/wiki/Heapsort. + +LIBC_INLINE void heap_sort(const Array &array) { + size_t end = array.size(); + size_t start = end / 2; + + auto left_child = [](size_t i) -> size_t { return 2 * i + 1; }; + + while (end > 1) { + if (start > 0) { + // Select the next unheapified element to sift down. + --start; + } else { + // Extract the max element of the heap, moving a leaf to root to be sifted + // down. + --end; + array.swap(0, end); + } + + // Sift start down the heap. + size_t root = start; + while (left_child(root) < end) { + size_t child = left_child(root); + // If there are two children, set child to the greater. + if (child + 1 < end && + array.elem_compare(child, array.get(child + 1)) < 0) + ++child; + + // If the root is less than the greater child + if (array.elem_compare(root, array.get(child)) >= 0) + break; + + // Swap the root with the greater child and continue sifting down. + array.swap(root, child); + root = child; + } + } +} + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_HEAP_SORT_H diff --git a/libc/src/stdlib/qsort.cpp b/libc/src/stdlib/qsort.cpp index 048e63ab214ed3..65a63c239f5c0d 100644 --- a/libc/src/stdlib/qsort.cpp +++ b/libc/src/stdlib/qsort.cpp @@ -21,8 +21,11 @@ LLVM_LIBC_FUNCTION(void, qsort, if (array == nullptr || array_size == 0 || elem_size == 0) return; internal::Comparator c(compare); - internal::quicksort(internal::Array(reinterpret_cast(array), - array_size, elem_size, c)); + + auto arr = internal::Array(reinterpret_cast(array), array_size, + elem_size, c); + + internal::sort(arr); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h new file mode 100644 index 00000000000000..db045332708ae6 --- /dev/null +++ b/libc/src/stdlib/qsort_data.h @@ -0,0 +1,102 @@ +//===-- Data structures for sorting routines --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_QSORT_DATA_H +#define LLVM_LIBC_SRC_STDLIB_QSORT_DATA_H + +#include "src/__support/CPP/cstddef.h" +#include "src/__support/macros/config.h" + +#include + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +using Compare = int(const void *, const void *); +using CompareWithState = int(const void *, const void *, void *); + +enum class CompType { COMPARE, COMPARE_WITH_STATE }; + +struct Comparator { + union { + Compare *comp_func; + CompareWithState *comp_func_r; + }; + const CompType comp_type; + + void *arg; + + Comparator(Compare *func) + : comp_func(func), comp_type(CompType::COMPARE), arg(nullptr) {} + + Comparator(CompareWithState *func, void *arg_val) + : comp_func_r(func), comp_type(CompType::COMPARE_WITH_STATE), + arg(arg_val) {} + +#if defined(__clang__) + // Recent upstream changes to -fsanitize=function find more instances of + // function type mismatches. One case is with the comparator passed to this + // class. Libraries will tend to pass comparators that take pointers to + // varying types while this comparator expects to accept const void pointers. + // Ideally those tools would pass a function that strictly accepts const + // void*s to avoid UB, or would use qsort_r to pass their own comparator. + [[clang::no_sanitize("function")]] +#endif + int comp_vals(const void *a, const void *b) const { + if (comp_type == CompType::COMPARE) { + return comp_func(a, b); + } else { + return comp_func_r(a, b, arg); + } + } +}; + +class Array { + uint8_t *array; + size_t array_size; + size_t elem_size; + Comparator compare; + +public: + Array(uint8_t *a, size_t s, size_t e, Comparator c) + : array(a), array_size(s), elem_size(e), compare(c) {} + + uint8_t *get(size_t i) const { return array + i * elem_size; } + + void swap(size_t i, size_t j) const { + uint8_t *elem_i = get(i); + uint8_t *elem_j = get(j); + for (size_t b = 0; b < elem_size; ++b) { + uint8_t temp = elem_i[b]; + elem_i[b] = elem_j[b]; + elem_j[b] = temp; + } + } + + int elem_compare(size_t i, const uint8_t *other) const { + // An element must compare equal to itself so we don't need to consult the + // user provided comparator. + if (get(i) == other) + return 0; + return compare.comp_vals(get(i), other); + } + + size_t size() const { return array_size; } + + // Make an Array starting at index |i| and size |s|. + Array make_array(size_t i, size_t s) const { + return Array(get(i), s, elem_size, compare); + } +}; + +using SortingRoutine = void(const Array &); + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_QSORT_DATA_H diff --git a/libc/src/stdlib/qsort_r.cpp b/libc/src/stdlib/qsort_r.cpp index efbe5ad484b0ea..bf61a40e847341 100644 --- a/libc/src/stdlib/qsort_r.cpp +++ b/libc/src/stdlib/qsort_r.cpp @@ -22,8 +22,10 @@ LLVM_LIBC_FUNCTION(void, qsort_r, if (array == nullptr || array_size == 0 || elem_size == 0) return; internal::Comparator c(compare, arg); - internal::quicksort(internal::Array(reinterpret_cast(array), - array_size, elem_size, c)); + auto arr = internal::Array(reinterpret_cast(array), array_size, + elem_size, c); + + internal::sort(arr); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/qsort_util.h b/libc/src/stdlib/qsort_util.h index 3a9cc4b8669f86..d42adde06d9762 100644 --- a/libc/src/stdlib/qsort_util.h +++ b/libc/src/stdlib/qsort_util.h @@ -9,145 +9,29 @@ #ifndef LLVM_LIBC_SRC_STDLIB_QSORT_UTIL_H #define LLVM_LIBC_SRC_STDLIB_QSORT_UTIL_H -#include "src/__support/macros/attributes.h" -#include "src/__support/macros/config.h" -#include -#include +#include "src/stdlib/heap_sort.h" +#include "src/stdlib/quick_sort.h" -namespace LIBC_NAMESPACE_DECL { -namespace internal { - -// A simple quicksort implementation using the Hoare partition scheme. - -using Compare = int(const void *, const void *); -using CompareWithState = int(const void *, const void *, void *); - -enum class CompType { COMPARE, COMPARE_WITH_STATE }; - -struct Comparator { - union { - Compare *comp_func; - CompareWithState *comp_func_r; - }; - const CompType comp_type; +#define LIBC_QSORT_QUICK_SORT 1 +#define LIBC_QSORT_HEAP_SORT 2 - void *arg; +#ifndef LIBC_QSORT_IMPL +#define LIBC_QSORT_IMPL LIBC_QSORT_QUICK_SORT +#endif // LIBC_QSORT_IMPL - Comparator(Compare *func) - : comp_func(func), comp_type(CompType::COMPARE), arg(nullptr) {} - - Comparator(CompareWithState *func, void *arg_val) - : comp_func_r(func), comp_type(CompType::COMPARE_WITH_STATE), - arg(arg_val) {} - -#if defined(__clang__) - // Recent upstream changes to -fsanitize=function find more instances of - // function type mismatches. One case is with the comparator passed to this - // class. Libraries will tend to pass comparators that take pointers to - // varying types while this comparator expects to accept const void pointers. - // Ideally those tools would pass a function that strictly accepts const - // void*s to avoid UB, or would use qsort_r to pass their own comparator. - [[clang::no_sanitize("function")]] +#if (LIBC_QSORT_IMPL != LIBC_QSORT_QUICK_SORT && \ + LIBC_QSORT_IMPL != LIBC_QSORT_HEAP_SORT) +#error "LIBC_QSORT_IMPL is not recognized." #endif - int comp_vals(const void *a, const void *b) const { - if (comp_type == CompType::COMPARE) { - return comp_func(a, b); - } else { - return comp_func_r(a, b, arg); - } - } -}; - -class Array { - uint8_t *array; - size_t array_size; - size_t elem_size; - Comparator compare; - -public: - Array(uint8_t *a, size_t s, size_t e, Comparator c) - : array(a), array_size(s), elem_size(e), compare(c) {} - - uint8_t *get(size_t i) const { return array + i * elem_size; } - - void swap(size_t i, size_t j) const { - uint8_t *elem_i = get(i); - uint8_t *elem_j = get(j); - for (size_t b = 0; b < elem_size; ++b) { - uint8_t temp = elem_i[b]; - elem_i[b] = elem_j[b]; - elem_j[b] = temp; - } - } - int elem_compare(size_t i, const uint8_t *other) const { - // An element must compare equal to itself so we don't need to consult the - // user provided comparator. - if (get(i) == other) - return 0; - return compare.comp_vals(get(i), other); - } - - size_t size() const { return array_size; } - - // Make an Array starting at index |i| and size |s|. - Array make_array(size_t i, size_t s) const { - return Array(get(i), s, elem_size, compare); - } -}; - -static size_t partition(const Array &array) { - const size_t array_size = array.size(); - size_t pivot_index = array_size / 2; - uint8_t *pivot = array.get(pivot_index); - size_t i = 0; - size_t j = array_size - 1; - - while (true) { - int compare_i, compare_j; - - while ((compare_i = array.elem_compare(i, pivot)) < 0) - ++i; - while ((compare_j = array.elem_compare(j, pivot)) > 0) - --j; - - // At some point i will crossover j so we will definitely break out of - // this while loop. - if (i >= j) - return j + 1; - - array.swap(i, j); - - // The pivot itself might have got swapped so we will update the pivot. - if (i == pivot_index) { - pivot = array.get(j); - pivot_index = j; - } else if (j == pivot_index) { - pivot = array.get(i); - pivot_index = i; - } - - if (compare_i == 0 && compare_j == 0) { - // If we do not move the pointers, we will end up with an - // infinite loop as i and j will be stuck without advancing. - ++i; - --j; - } - } -} +namespace LIBC_NAMESPACE_DECL { +namespace internal { -LIBC_INLINE void quicksort(const Array &array) { - const size_t array_size = array.size(); - if (array_size <= 1) - return; - size_t split_index = partition(array); - if (array_size <= 2) { - // The partition operation sorts the two element array. - return; - } - quicksort(array.make_array(0, split_index)); - quicksort(array.make_array(split_index, array.size() - split_index)); -} +#if LIBC_QSORT_IMPL == LIBC_QSORT_QUICK_SORT +constexpr auto sort = quick_sort; +#elif LIBC_QSORT_IMPL == LIBC_QSORT_HEAP_SORT +constexpr auto sort = heap_sort; +#endif } // namespace internal } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h new file mode 100644 index 00000000000000..89ec107161e3e5 --- /dev/null +++ b/libc/src/stdlib/quick_sort.h @@ -0,0 +1,78 @@ +//===-- Implementation header for qsort utilities ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H +#define LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H + +#include "src/__support/macros/attributes.h" +#include "src/__support/macros/config.h" +#include "src/stdlib/qsort_data.h" + +#include + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +// A simple quicksort implementation using the Hoare partition scheme. +static size_t partition(const Array &array) { + const size_t array_size = array.size(); + size_t pivot_index = array_size / 2; + uint8_t *pivot = array.get(pivot_index); + size_t i = 0; + size_t j = array_size - 1; + + while (true) { + int compare_i, compare_j; + + while ((compare_i = array.elem_compare(i, pivot)) < 0) + ++i; + while ((compare_j = array.elem_compare(j, pivot)) > 0) + --j; + + // At some point i will crossover j so we will definitely break out of + // this while loop. + if (i >= j) + return j + 1; + + array.swap(i, j); + + // The pivot itself might have got swapped so we will update the pivot. + if (i == pivot_index) { + pivot = array.get(j); + pivot_index = j; + } else if (j == pivot_index) { + pivot = array.get(i); + pivot_index = i; + } + + if (compare_i == 0 && compare_j == 0) { + // If we do not move the pointers, we will end up with an + // infinite loop as i and j will be stuck without advancing. + ++i; + --j; + } + } +} + +LIBC_INLINE void quick_sort(const Array &array) { + const size_t array_size = array.size(); + if (array_size <= 1) + return; + size_t split_index = partition(array); + if (array_size <= 2) { + // The partition operation sorts the two element array. + return; + } + quick_sort(array.make_array(0, split_index)); + quick_sort(array.make_array(split_index, array.size() - split_index)); +} + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H diff --git a/libc/test/UnitTest/LibcTest.cpp b/libc/test/UnitTest/LibcTest.cpp index 72aeaf20e1dac4..ad5722f99a4369 100644 --- a/libc/test/UnitTest/LibcTest.cpp +++ b/libc/test/UnitTest/LibcTest.cpp @@ -159,13 +159,13 @@ int Test::runTests(const TestOptions &Options) { } tlog << green << "[ RUN ] " << reset << TestName << '\n'; - [[maybe_unused]] const auto start_time = clock(); + [[maybe_unused]] const uint64_t start_time = clock(); RunContext Ctx; T->SetUp(); T->setContext(&Ctx); T->Run(); T->TearDown(); - [[maybe_unused]] const auto end_time = clock(); + [[maybe_unused]] const uint64_t end_time = clock(); switch (Ctx.status()) { case RunContext::RunResult::Fail: tlog << red << "[ FAILED ] " << reset << TestName << '\n'; diff --git a/libc/test/src/__support/CPP/type_traits_test.cpp b/libc/test/src/__support/CPP/type_traits_test.cpp index 3c6268f86fbd12..fa5298a12d3fc7 100644 --- a/libc/test/src/__support/CPP/type_traits_test.cpp +++ b/libc/test/src/__support/CPP/type_traits_test.cpp @@ -119,7 +119,7 @@ TEST(LlvmLibcTypeTraitsTest, aligned_storage) { int a, b; }; aligned_storage_t buf; - EXPECT_EQ(alignof(buf), alignof(S)); + EXPECT_EQ(alignof(decltype(buf)), alignof(S)); EXPECT_EQ(sizeof(buf), sizeof(S)); } diff --git a/libc/test/src/__support/FPUtil/fpbits_test.cpp b/libc/test/src/__support/FPUtil/fpbits_test.cpp index af20b1a0bdc7ef..99acc03010344f 100644 --- a/libc/test/src/__support/FPUtil/fpbits_test.cpp +++ b/libc/test/src/__support/FPUtil/fpbits_test.cpp @@ -238,6 +238,7 @@ template constexpr auto make(Sign sign, FP fp) { case FP::QUIET_NAN: return T::quiet_nan(sign); } + __builtin_unreachable(); } // Tests all properties for all types of float. diff --git a/libc/test/src/__support/big_int_test.cpp b/libc/test/src/__support/big_int_test.cpp index 2c3d57755cd5bb..a1ce69baaae290 100644 --- a/libc/test/src/__support/big_int_test.cpp +++ b/libc/test/src/__support/big_int_test.cpp @@ -32,6 +32,7 @@ template auto create(Value value) { case MAX: return T::max(); } + __builtin_unreachable(); } using Types = testing::TypeList< // @@ -264,7 +265,11 @@ TEST(LlvmLibcUIntClassTest, BitCastToFromNativeFloat128) { TEST(LlvmLibcUIntClassTest, BitCastToFromNativeFloat16) { static_assert(cpp::is_trivially_copyable::value); static_assert(sizeof(LL_UInt16) == sizeof(float16)); - const float16 array[] = {0, 0.1, 1}; + const float16 array[] = { + static_cast(0.0), + static_cast(0.1), + static_cast(1.0), + }; for (float16 value : array) { LL_UInt16 back = cpp::bit_cast(value); float16 forth = cpp::bit_cast(back); diff --git a/libc/test/src/math/RoundToIntegerTest.h b/libc/test/src/math/RoundToIntegerTest.h index 2b1c6432675905..d3e557a8479534 100644 --- a/libc/test/src/math/RoundToIntegerTest.h +++ b/libc/test/src/math/RoundToIntegerTest.h @@ -167,7 +167,9 @@ class RoundToIntegerTestTemplate } void do_fractions_test(RoundToIntegerFunc func, int mode) { - constexpr F FRACTIONS[] = {0.5, -0.5, 0.115, -0.115, 0.715, -0.715}; + constexpr F FRACTIONS[] = { + F(0.5), F(-0.5), F(0.115), F(-0.115), F(0.715), F(-0.715), + }; for (F x : FRACTIONS) { long mpfr_long_result; bool erangeflag; diff --git a/libc/test/src/math/performance_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt index c7241317687e0a..a75becba04d07e 100644 --- a/libc/test/src/math/performance_testing/CMakeLists.txt +++ b/libc/test/src/math/performance_testing/CMakeLists.txt @@ -21,7 +21,7 @@ function(add_perf_binary target_name) "PERF" "" # No optional arguments "SUITE;CXX_STANDARD" # Single value arguments - "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS" # Multi-value arguments + "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS;LINK_LIBRARIES" # Multi-value arguments ${ARGN} ) if(NOT PERF_SRCS) @@ -64,9 +64,13 @@ function(add_perf_binary target_name) ) endif() + set(link_libraries ${link_object_files}) + foreach(lib IN LISTS PERF_LINK_LIBRARIES) + list(APPEND link_libraries ${lib}.unit) + endforeach() target_link_libraries( ${fq_target_name} - PRIVATE ${link_object_files} libc_diff_test_utils) + PRIVATE ${link_libraries} libc_diff_test_utils) set_target_properties(${fq_target_name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) @@ -385,12 +389,16 @@ add_perf_binary( libc.src.math.ceilf16 libc.src.math.floorf libc.src.math.floorf16 - libc.src.math.roundevenf - libc.src.math.roundevenf16 + libc.src.math.rintf + libc.src.math.rintf16 libc.src.math.roundf libc.src.math.roundf16 + libc.src.math.roundevenf + libc.src.math.roundevenf16 libc.src.math.truncf libc.src.math.truncf16 COMPILE_OPTIONS -fno-builtin + LINK_LIBRARIES + LibcFPTestHelpers ) diff --git a/libc/test/src/math/performance_testing/nearest_integer_funcs_perf.cpp b/libc/test/src/math/performance_testing/nearest_integer_funcs_perf.cpp index 24176a377e9d48..b7bd6636a72e1f 100644 --- a/libc/test/src/math/performance_testing/nearest_integer_funcs_perf.cpp +++ b/libc/test/src/math/performance_testing/nearest_integer_funcs_perf.cpp @@ -11,17 +11,23 @@ #include "src/math/ceilf16.h" #include "src/math/floorf.h" #include "src/math/floorf16.h" +#include "src/math/rintf.h" +#include "src/math/rintf16.h" #include "src/math/roundevenf.h" #include "src/math/roundevenf16.h" #include "src/math/roundf.h" #include "src/math/roundf16.h" #include "src/math/truncf.h" #include "src/math/truncf16.h" +#include "test/UnitTest/RoundingModeUtils.h" #include "test/src/math/performance_testing/Timer.h" #include #include +using LIBC_NAMESPACE::fputil::testing::ForceRoundingMode; +using LIBC_NAMESPACE::fputil::testing::RoundingMode; + namespace LIBC_NAMESPACE::testing { template class NearestIntegerPerf { @@ -36,7 +42,7 @@ template class NearestIntegerPerf { StorageType ending_bit, StorageType step, size_t rounds, std::ofstream &log) { auto runner = [=](Func func) { - volatile T result; + [[maybe_unused]] volatile T result; for (size_t i = 0; i < rounds; i++) { for (StorageType bits = starting_bit; bits <= ending_bit; bits += step) { @@ -146,10 +152,10 @@ int main() { FLOAT16_ROUNDS, "ceilf16_perf.log") NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::floorf16, ::placeholderf16, FLOAT16_ROUNDS, "floorf16_perf.log") - NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::roundevenf16, ::placeholderf16, - FLOAT16_ROUNDS, "roundevenf16_perf.log") NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::roundf16, ::placeholderf16, FLOAT16_ROUNDS, "roundf16_perf.log") + NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::roundevenf16, ::placeholderf16, + FLOAT16_ROUNDS, "roundevenf16_perf.log") NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::truncf16, ::placeholderf16, FLOAT16_ROUNDS, "truncf16_perf.log") @@ -157,12 +163,37 @@ int main() { "ceilf_perf.log") NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::floorf, ::floorf, FLOAT_ROUNDS, "floorf_perf.log") - NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::roundevenf, ::placeholderf, - FLOAT_ROUNDS, "roundevenf_perf.log") NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::roundf, ::roundf, FLOAT_ROUNDS, "roundf_perf.log") + NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::roundevenf, ::placeholderf, + FLOAT_ROUNDS, "roundevenf_perf.log") NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::truncf, ::truncf, FLOAT_ROUNDS, "truncf_perf.log") + if (ForceRoundingMode r(RoundingMode::Upward); r.success) { + NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::rintf16, ::placeholderf16, + FLOAT16_ROUNDS, "rintf16_upward_perf.log") + NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::rintf, ::rintf, FLOAT_ROUNDS, + "rintf_upward_perf.log") + } + if (ForceRoundingMode r(RoundingMode::Downward); r.success) { + NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::rintf16, ::placeholderf16, + FLOAT16_ROUNDS, "rintf16_downward_perf.log") + NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::rintf, ::rintf, FLOAT_ROUNDS, + "rintf_downward_perf.log") + } + if (ForceRoundingMode r(RoundingMode::TowardZero); r.success) { + NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::rintf16, ::placeholderf16, + FLOAT16_ROUNDS, "rintf16_towardzero_perf.log") + NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::rintf, ::rintf, FLOAT_ROUNDS, + "rintf_towardzero_perf.log") + } + if (ForceRoundingMode r(RoundingMode::Nearest); r.success) { + NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::rintf16, ::placeholderf16, + FLOAT16_ROUNDS, "rintf16_nearest_perf.log") + NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::rintf, ::rintf, FLOAT_ROUNDS, + "rintf_nearest_perf.log") + } + return 0; } diff --git a/libc/test/src/math/smoke/FMaxTest.h b/libc/test/src/math/smoke/FMaxTest.h index f4c78b5d04b5b8..1cb105aa007785 100644 --- a/libc/test/src/math/smoke/FMaxTest.h +++ b/libc/test/src/math/smoke/FMaxTest.h @@ -25,8 +25,8 @@ class FMaxTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { void testNaN(FMaxFunc func) { EXPECT_FP_EQ(inf, func(aNaN, inf)); EXPECT_FP_EQ(neg_inf, func(neg_inf, aNaN)); - EXPECT_FP_EQ(0.0, func(aNaN, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, aNaN)); + EXPECT_FP_EQ(zero, func(aNaN, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, aNaN)); EXPECT_FP_EQ(T(-1.2345), func(aNaN, T(-1.2345))); EXPECT_FP_EQ(T(1.2345), func(T(1.2345), aNaN)); EXPECT_FP_EQ(aNaN, func(aNaN, aNaN)); @@ -34,25 +34,25 @@ class FMaxTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { void testInfArg(FMaxFunc func) { EXPECT_FP_EQ(inf, func(neg_inf, inf)); - EXPECT_FP_EQ(inf, func(inf, 0.0)); - EXPECT_FP_EQ(inf, func(-0.0, inf)); + EXPECT_FP_EQ(inf, func(inf, zero)); + EXPECT_FP_EQ(inf, func(neg_zero, inf)); EXPECT_FP_EQ(inf, func(inf, T(1.2345))); EXPECT_FP_EQ(inf, func(T(-1.2345), inf)); } void testNegInfArg(FMaxFunc func) { EXPECT_FP_EQ(inf, func(inf, neg_inf)); - EXPECT_FP_EQ(0.0, func(neg_inf, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, neg_inf)); + EXPECT_FP_EQ(zero, func(neg_inf, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, neg_inf)); EXPECT_FP_EQ(T(-1.2345), func(neg_inf, T(-1.2345))); EXPECT_FP_EQ(T(1.2345), func(T(1.2345), neg_inf)); } void testBothZero(FMaxFunc func) { - EXPECT_FP_EQ(0.0, func(0.0, 0.0)); - EXPECT_FP_EQ(0.0, func(-0.0, 0.0)); - EXPECT_FP_EQ(0.0, func(0.0, -0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, -0.0)); + EXPECT_FP_EQ(zero, func(zero, zero)); + EXPECT_FP_EQ(zero, func(neg_zero, zero)); + EXPECT_FP_EQ(zero, func(zero, neg_zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, neg_zero)); } void testRange(FMaxFunc func) { diff --git a/libc/test/src/math/smoke/FMaximumMagNumTest.h b/libc/test/src/math/smoke/FMaximumMagNumTest.h index 726f87059fc64b..b52169e5e86807 100644 --- a/libc/test/src/math/smoke/FMaximumMagNumTest.h +++ b/libc/test/src/math/smoke/FMaximumMagNumTest.h @@ -30,10 +30,10 @@ class FMaximumMagNumTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { EXPECT_FP_EQ(neg_inf, func(neg_inf, aNaN)); EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, func(neg_inf, sNaN), FE_INVALID); EXPECT_EQ(FPBits(aNaN).uintval(), FPBits(func(aNaN, aNaN)).uintval()); - EXPECT_FP_EQ(0.0, func(aNaN, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, aNaN)); - EXPECT_FP_EQ_WITH_EXCEPTION(0.0, func(sNaN, 0.0), FE_INVALID); - EXPECT_FP_EQ_WITH_EXCEPTION(-0.0, func(-0.0, sNaN), FE_INVALID); + EXPECT_FP_EQ(zero, func(aNaN, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, aNaN)); + EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(sNaN, zero), FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_zero, func(neg_zero, sNaN), FE_INVALID); EXPECT_FP_EQ(T(-1.2345), func(aNaN, T(-1.2345))); EXPECT_FP_EQ(T(1.2345), func(T(1.2345), aNaN)); EXPECT_FP_EQ_WITH_EXCEPTION(T(-1.2345), func(sNaN, T(-1.2345)), FE_INVALID); @@ -47,25 +47,25 @@ class FMaximumMagNumTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { void testInfArg(FMaximumMagNumFunc func) { EXPECT_FP_EQ(inf, func(neg_inf, inf)); - EXPECT_FP_EQ(inf, func(inf, 0.0)); - EXPECT_FP_EQ(inf, func(-0.0, inf)); + EXPECT_FP_EQ(inf, func(inf, zero)); + EXPECT_FP_EQ(inf, func(neg_zero, inf)); EXPECT_FP_EQ(inf, func(inf, T(1.2345))); EXPECT_FP_EQ(inf, func(T(-1.2345), inf)); } void testNegInfArg(FMaximumMagNumFunc func) { EXPECT_FP_EQ(inf, func(inf, neg_inf)); - EXPECT_FP_EQ(neg_inf, func(neg_inf, 0.0)); - EXPECT_FP_EQ(neg_inf, func(-0.0, neg_inf)); + EXPECT_FP_EQ(neg_inf, func(neg_inf, zero)); + EXPECT_FP_EQ(neg_inf, func(neg_zero, neg_inf)); EXPECT_FP_EQ(neg_inf, func(neg_inf, T(-1.2345))); EXPECT_FP_EQ(neg_inf, func(T(1.2345), neg_inf)); } void testBothZero(FMaximumMagNumFunc func) { - EXPECT_FP_EQ(0.0, func(0.0, 0.0)); - EXPECT_FP_EQ(0.0, func(-0.0, 0.0)); - EXPECT_FP_EQ(0.0, func(0.0, -0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, -0.0)); + EXPECT_FP_EQ(zero, func(zero, zero)); + EXPECT_FP_EQ(zero, func(neg_zero, zero)); + EXPECT_FP_EQ(zero, func(zero, neg_zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, neg_zero)); } void testRange(FMaximumMagNumFunc func) { diff --git a/libc/test/src/math/smoke/FMaximumMagTest.h b/libc/test/src/math/smoke/FMaximumMagTest.h index b5b2c1ca79abc4..81a232d96ec941 100644 --- a/libc/test/src/math/smoke/FMaximumMagTest.h +++ b/libc/test/src/math/smoke/FMaximumMagTest.h @@ -26,8 +26,8 @@ class FMaximumMagTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { void testNaN(FMaximumMagFunc func) { EXPECT_FP_EQ(aNaN, func(aNaN, inf)); EXPECT_FP_EQ(aNaN, func(neg_inf, aNaN)); - EXPECT_FP_EQ(aNaN, func(aNaN, 0.0)); - EXPECT_FP_EQ(aNaN, func(-0.0, aNaN)); + EXPECT_FP_EQ(aNaN, func(aNaN, zero)); + EXPECT_FP_EQ(aNaN, func(neg_zero, aNaN)); EXPECT_FP_EQ(aNaN, func(aNaN, T(-1.2345))); EXPECT_FP_EQ(aNaN, func(T(1.2345), aNaN)); EXPECT_FP_EQ(aNaN, func(aNaN, aNaN)); @@ -35,25 +35,25 @@ class FMaximumMagTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { void testInfArg(FMaximumMagFunc func) { EXPECT_FP_EQ(inf, func(neg_inf, inf)); - EXPECT_FP_EQ(inf, func(inf, 0.0)); - EXPECT_FP_EQ(inf, func(-0.0, inf)); + EXPECT_FP_EQ(inf, func(inf, zero)); + EXPECT_FP_EQ(inf, func(neg_zero, inf)); EXPECT_FP_EQ(inf, func(inf, T(1.2345))); EXPECT_FP_EQ(inf, func(T(-1.2345), inf)); } void testNegInfArg(FMaximumMagFunc func) { EXPECT_FP_EQ(inf, func(inf, neg_inf)); - EXPECT_FP_EQ(neg_inf, func(neg_inf, 0.0)); - EXPECT_FP_EQ(neg_inf, func(-0.0, neg_inf)); + EXPECT_FP_EQ(neg_inf, func(neg_inf, zero)); + EXPECT_FP_EQ(neg_inf, func(neg_zero, neg_inf)); EXPECT_FP_EQ(neg_inf, func(neg_inf, T(-1.2345))); EXPECT_FP_EQ(neg_inf, func(T(1.2345), neg_inf)); } void testBothZero(FMaximumMagFunc func) { - EXPECT_FP_EQ(0.0, func(0.0, 0.0)); - EXPECT_FP_EQ(0.0, func(-0.0, 0.0)); - EXPECT_FP_EQ(0.0, func(0.0, -0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, -0.0)); + EXPECT_FP_EQ(zero, func(zero, zero)); + EXPECT_FP_EQ(zero, func(neg_zero, zero)); + EXPECT_FP_EQ(zero, func(zero, neg_zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, neg_zero)); } void testRange(FMaximumMagFunc func) { diff --git a/libc/test/src/math/smoke/FMaximumNumTest.h b/libc/test/src/math/smoke/FMaximumNumTest.h index ec7913509d3942..f4e05b9f455ed2 100644 --- a/libc/test/src/math/smoke/FMaximumNumTest.h +++ b/libc/test/src/math/smoke/FMaximumNumTest.h @@ -29,10 +29,10 @@ class FMaximumNumTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { EXPECT_FP_EQ(neg_inf, func(neg_inf, aNaN)); EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, func(neg_inf, sNaN), FE_INVALID); EXPECT_EQ(FPBits(aNaN).uintval(), FPBits(func(aNaN, aNaN)).uintval()); - EXPECT_FP_EQ(0.0, func(aNaN, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, aNaN)); - EXPECT_FP_EQ_WITH_EXCEPTION(0.0, func(sNaN, 0.0), FE_INVALID); - EXPECT_FP_EQ_WITH_EXCEPTION(-0.0, func(-0.0, sNaN), FE_INVALID); + EXPECT_FP_EQ(zero, func(aNaN, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, aNaN)); + EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(sNaN, zero), FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_zero, func(neg_zero, sNaN), FE_INVALID); EXPECT_FP_EQ(T(-1.2345), func(aNaN, T(-1.2345))); EXPECT_FP_EQ(T(1.2345), func(T(1.2345), aNaN)); EXPECT_FP_EQ_WITH_EXCEPTION(T(-1.2345), func(sNaN, T(-1.2345)), FE_INVALID); @@ -46,25 +46,25 @@ class FMaximumNumTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { void testInfArg(FMaximumNumFunc func) { EXPECT_FP_EQ(inf, func(neg_inf, inf)); - EXPECT_FP_EQ(inf, func(inf, 0.0)); - EXPECT_FP_EQ(inf, func(-0.0, inf)); + EXPECT_FP_EQ(inf, func(inf, zero)); + EXPECT_FP_EQ(inf, func(neg_zero, inf)); EXPECT_FP_EQ(inf, func(inf, T(1.2345))); EXPECT_FP_EQ(inf, func(T(-1.2345), inf)); } void testNegInfArg(FMaximumNumFunc func) { EXPECT_FP_EQ(inf, func(inf, neg_inf)); - EXPECT_FP_EQ(0.0, func(neg_inf, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, neg_inf)); + EXPECT_FP_EQ(zero, func(neg_inf, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, neg_inf)); EXPECT_FP_EQ(T(-1.2345), func(neg_inf, T(-1.2345))); EXPECT_FP_EQ(T(1.2345), func(T(1.2345), neg_inf)); } void testBothZero(FMaximumNumFunc func) { - EXPECT_FP_EQ(0.0, func(0.0, 0.0)); - EXPECT_FP_EQ(0.0, func(-0.0, 0.0)); - EXPECT_FP_EQ(0.0, func(0.0, -0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, -0.0)); + EXPECT_FP_EQ(zero, func(zero, zero)); + EXPECT_FP_EQ(zero, func(neg_zero, zero)); + EXPECT_FP_EQ(zero, func(zero, neg_zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, neg_zero)); } void testRange(FMaximumNumFunc func) { diff --git a/libc/test/src/math/smoke/FMaximumTest.h b/libc/test/src/math/smoke/FMaximumTest.h index 94e4a343190a5b..5e71a41d7b345a 100644 --- a/libc/test/src/math/smoke/FMaximumTest.h +++ b/libc/test/src/math/smoke/FMaximumTest.h @@ -25,8 +25,8 @@ class FMaximumTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { void testNaN(FMaximumFunc func) { EXPECT_FP_EQ(aNaN, func(aNaN, inf)); EXPECT_FP_EQ(aNaN, func(neg_inf, aNaN)); - EXPECT_FP_EQ(aNaN, func(aNaN, 0.0)); - EXPECT_FP_EQ(aNaN, func(-0.0, aNaN)); + EXPECT_FP_EQ(aNaN, func(aNaN, zero)); + EXPECT_FP_EQ(aNaN, func(neg_zero, aNaN)); EXPECT_FP_EQ(aNaN, func(aNaN, T(-1.2345))); EXPECT_FP_EQ(aNaN, func(T(1.2345), aNaN)); EXPECT_FP_EQ(aNaN, func(aNaN, aNaN)); @@ -34,25 +34,25 @@ class FMaximumTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { void testInfArg(FMaximumFunc func) { EXPECT_FP_EQ(inf, func(neg_inf, inf)); - EXPECT_FP_EQ(inf, func(inf, 0.0)); - EXPECT_FP_EQ(inf, func(-0.0, inf)); + EXPECT_FP_EQ(inf, func(inf, zero)); + EXPECT_FP_EQ(inf, func(neg_zero, inf)); EXPECT_FP_EQ(inf, func(inf, T(1.2345))); EXPECT_FP_EQ(inf, func(T(-1.2345), inf)); } void testNegInfArg(FMaximumFunc func) { EXPECT_FP_EQ(inf, func(inf, neg_inf)); - EXPECT_FP_EQ(0.0, func(neg_inf, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, neg_inf)); + EXPECT_FP_EQ(zero, func(neg_inf, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, neg_inf)); EXPECT_FP_EQ(T(-1.2345), func(neg_inf, T(-1.2345))); EXPECT_FP_EQ(T(1.2345), func(T(1.2345), neg_inf)); } void testBothZero(FMaximumFunc func) { - EXPECT_FP_EQ(0.0, func(0.0, 0.0)); - EXPECT_FP_EQ(0.0, func(-0.0, 0.0)); - EXPECT_FP_EQ(0.0, func(0.0, -0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, -0.0)); + EXPECT_FP_EQ(zero, func(zero, zero)); + EXPECT_FP_EQ(zero, func(neg_zero, zero)); + EXPECT_FP_EQ(zero, func(zero, neg_zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, neg_zero)); } void testRange(FMaximumFunc func) { diff --git a/libc/test/src/math/smoke/FMinTest.h b/libc/test/src/math/smoke/FMinTest.h index 629aaab729a86c..049d94eb1b3401 100644 --- a/libc/test/src/math/smoke/FMinTest.h +++ b/libc/test/src/math/smoke/FMinTest.h @@ -25,8 +25,8 @@ class FMinTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { void testNaN(FMinFunc func) { EXPECT_FP_EQ(inf, func(aNaN, inf)); EXPECT_FP_EQ(neg_inf, func(neg_inf, aNaN)); - EXPECT_FP_EQ(0.0, func(aNaN, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, aNaN)); + EXPECT_FP_EQ(zero, func(aNaN, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, aNaN)); EXPECT_FP_EQ(T(-1.2345), func(aNaN, T(-1.2345))); EXPECT_FP_EQ(T(1.2345), func(T(1.2345), aNaN)); EXPECT_FP_EQ(aNaN, func(aNaN, aNaN)); @@ -34,25 +34,25 @@ class FMinTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { void testInfArg(FMinFunc func) { EXPECT_FP_EQ(neg_inf, func(neg_inf, inf)); - EXPECT_FP_EQ(0.0, func(inf, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, inf)); + EXPECT_FP_EQ(zero, func(inf, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, inf)); EXPECT_FP_EQ(T(1.2345), func(inf, T(1.2345))); EXPECT_FP_EQ(T(-1.2345), func(T(-1.2345), inf)); } void testNegInfArg(FMinFunc func) { EXPECT_FP_EQ(neg_inf, func(inf, neg_inf)); - EXPECT_FP_EQ(neg_inf, func(neg_inf, 0.0)); - EXPECT_FP_EQ(neg_inf, func(-0.0, neg_inf)); + EXPECT_FP_EQ(neg_inf, func(neg_inf, zero)); + EXPECT_FP_EQ(neg_inf, func(neg_zero, neg_inf)); EXPECT_FP_EQ(neg_inf, func(neg_inf, T(-1.2345))); EXPECT_FP_EQ(neg_inf, func(T(1.2345), neg_inf)); } void testBothZero(FMinFunc func) { - EXPECT_FP_EQ(0.0, func(0.0, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, 0.0)); - EXPECT_FP_EQ(-0.0, func(0.0, -0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, -0.0)); + EXPECT_FP_EQ(zero, func(zero, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, zero)); + EXPECT_FP_EQ(neg_zero, func(zero, neg_zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, neg_zero)); } void testRange(FMinFunc func) { diff --git a/libc/test/src/math/smoke/FMinimumMagNumTest.h b/libc/test/src/math/smoke/FMinimumMagNumTest.h index 2ceca6ff95bac2..4cec6f08b2daa1 100644 --- a/libc/test/src/math/smoke/FMinimumMagNumTest.h +++ b/libc/test/src/math/smoke/FMinimumMagNumTest.h @@ -30,10 +30,10 @@ class FMinimumMagNumTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { EXPECT_FP_EQ(neg_inf, func(neg_inf, aNaN)); EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, func(neg_inf, sNaN), FE_INVALID); EXPECT_EQ(FPBits(aNaN).uintval(), FPBits(func(aNaN, aNaN)).uintval()); - EXPECT_FP_EQ(0.0, func(aNaN, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, aNaN)); - EXPECT_FP_EQ_WITH_EXCEPTION(0.0, func(sNaN, 0.0), FE_INVALID); - EXPECT_FP_EQ_WITH_EXCEPTION(-0.0, func(-0.0, sNaN), FE_INVALID); + EXPECT_FP_EQ(zero, func(aNaN, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, aNaN)); + EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(sNaN, zero), FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_zero, func(neg_zero, sNaN), FE_INVALID); EXPECT_FP_EQ(T(-1.2345), func(aNaN, T(-1.2345))); EXPECT_FP_EQ(T(1.2345), func(T(1.2345), aNaN)); EXPECT_FP_EQ_WITH_EXCEPTION(T(-1.2345), func(sNaN, T(-1.2345)), FE_INVALID); @@ -47,25 +47,25 @@ class FMinimumMagNumTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { void testInfArg(FMinimumMagNumFunc func) { EXPECT_FP_EQ(neg_inf, func(neg_inf, inf)); - EXPECT_FP_EQ(0.0, func(inf, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, inf)); + EXPECT_FP_EQ(zero, func(inf, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, inf)); EXPECT_FP_EQ(T(1.2345), func(inf, T(1.2345))); EXPECT_FP_EQ(T(-1.2345), func(T(-1.2345), inf)); } void testNegInfArg(FMinimumMagNumFunc func) { EXPECT_FP_EQ(neg_inf, func(inf, neg_inf)); - EXPECT_FP_EQ(0.0, func(neg_inf, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, neg_inf)); + EXPECT_FP_EQ(zero, func(neg_inf, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, neg_inf)); EXPECT_FP_EQ(T(-1.2345), func(neg_inf, T(-1.2345))); EXPECT_FP_EQ(T(1.2345), func(T(1.2345), neg_inf)); } void testBothZero(FMinimumMagNumFunc func) { - EXPECT_FP_EQ(0.0, func(0.0, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, 0.0)); - EXPECT_FP_EQ(-0.0, func(0.0, -0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, -0.0)); + EXPECT_FP_EQ(zero, func(zero, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, zero)); + EXPECT_FP_EQ(neg_zero, func(zero, neg_zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, neg_zero)); } void testRange(FMinimumMagNumFunc func) { diff --git a/libc/test/src/math/smoke/FMinimumMagTest.h b/libc/test/src/math/smoke/FMinimumMagTest.h index 9c49446795ceef..18b43815bdeca8 100644 --- a/libc/test/src/math/smoke/FMinimumMagTest.h +++ b/libc/test/src/math/smoke/FMinimumMagTest.h @@ -26,8 +26,8 @@ class FMinimumMagTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { void testNaN(FMinimumMagFunc func) { EXPECT_FP_EQ(aNaN, func(aNaN, inf)); EXPECT_FP_EQ(aNaN, func(neg_inf, aNaN)); - EXPECT_FP_EQ(aNaN, func(aNaN, 0.0)); - EXPECT_FP_EQ(aNaN, func(-0.0, aNaN)); + EXPECT_FP_EQ(aNaN, func(aNaN, zero)); + EXPECT_FP_EQ(aNaN, func(neg_zero, aNaN)); EXPECT_FP_EQ(aNaN, func(aNaN, T(-1.2345))); EXPECT_FP_EQ(aNaN, func(T(1.2345), aNaN)); EXPECT_FP_EQ(aNaN, func(aNaN, aNaN)); @@ -35,25 +35,25 @@ class FMinimumMagTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { void testInfArg(FMinimumMagFunc func) { EXPECT_FP_EQ(neg_inf, func(neg_inf, inf)); - EXPECT_FP_EQ(0.0, func(inf, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, inf)); + EXPECT_FP_EQ(zero, func(inf, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, inf)); EXPECT_FP_EQ(T(1.2345), func(inf, T(1.2345))); EXPECT_FP_EQ(T(-1.2345), func(T(-1.2345), inf)); } void testNegInfArg(FMinimumMagFunc func) { EXPECT_FP_EQ(neg_inf, func(inf, neg_inf)); - EXPECT_FP_EQ(0.0, func(neg_inf, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, neg_inf)); + EXPECT_FP_EQ(zero, func(neg_inf, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, neg_inf)); EXPECT_FP_EQ(T(-1.2345), func(neg_inf, T(-1.2345))); EXPECT_FP_EQ(T(1.2345), func(T(1.2345), neg_inf)); } void testBothZero(FMinimumMagFunc func) { - EXPECT_FP_EQ(0.0, func(0.0, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, 0.0)); - EXPECT_FP_EQ(-0.0, func(0.0, -0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, -0.0)); + EXPECT_FP_EQ(zero, func(zero, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, zero)); + EXPECT_FP_EQ(neg_zero, func(zero, neg_zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, neg_zero)); } void testRange(FMinimumMagFunc func) { diff --git a/libc/test/src/math/smoke/FMinimumNumTest.h b/libc/test/src/math/smoke/FMinimumNumTest.h index 8004ee98745432..dddcdc28d30c83 100644 --- a/libc/test/src/math/smoke/FMinimumNumTest.h +++ b/libc/test/src/math/smoke/FMinimumNumTest.h @@ -29,10 +29,10 @@ class FMinimumNumTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { EXPECT_FP_EQ(neg_inf, func(neg_inf, aNaN)); EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, func(neg_inf, sNaN), FE_INVALID); EXPECT_EQ(FPBits(aNaN).uintval(), FPBits(func(aNaN, aNaN)).uintval()); - EXPECT_FP_EQ(0.0, func(aNaN, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, aNaN)); - EXPECT_FP_EQ_WITH_EXCEPTION(0.0, func(sNaN, 0.0), FE_INVALID); - EXPECT_FP_EQ_WITH_EXCEPTION(-0.0, func(-0.0, sNaN), FE_INVALID); + EXPECT_FP_EQ(zero, func(aNaN, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, aNaN)); + EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(sNaN, zero), FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_zero, func(neg_zero, sNaN), FE_INVALID); EXPECT_FP_EQ(T(-1.2345), func(aNaN, T(-1.2345))); EXPECT_FP_EQ(T(1.2345), func(T(1.2345), aNaN)); EXPECT_FP_EQ_WITH_EXCEPTION(T(-1.2345), func(sNaN, T(-1.2345)), FE_INVALID); @@ -46,25 +46,25 @@ class FMinimumNumTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { void testInfArg(FMinimumNumFunc func) { EXPECT_FP_EQ(neg_inf, func(neg_inf, inf)); - EXPECT_FP_EQ(0.0, func(inf, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, inf)); + EXPECT_FP_EQ(zero, func(inf, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, inf)); EXPECT_FP_EQ(T(1.2345), func(inf, T(1.2345))); EXPECT_FP_EQ(T(-1.2345), func(T(-1.2345), inf)); } void testNegInfArg(FMinimumNumFunc func) { EXPECT_FP_EQ(neg_inf, func(inf, neg_inf)); - EXPECT_FP_EQ(neg_inf, func(neg_inf, 0.0)); - EXPECT_FP_EQ(neg_inf, func(-0.0, neg_inf)); + EXPECT_FP_EQ(neg_inf, func(neg_inf, zero)); + EXPECT_FP_EQ(neg_inf, func(neg_zero, neg_inf)); EXPECT_FP_EQ(neg_inf, func(neg_inf, T(-1.2345))); EXPECT_FP_EQ(neg_inf, func(T(1.2345), neg_inf)); } void testBothZero(FMinimumNumFunc func) { - EXPECT_FP_EQ(0.0, func(0.0, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, 0.0)); - EXPECT_FP_EQ(-0.0, func(0.0, -0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, -0.0)); + EXPECT_FP_EQ(zero, func(zero, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, zero)); + EXPECT_FP_EQ(neg_zero, func(zero, neg_zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, neg_zero)); } void testRange(FMinimumNumFunc func) { diff --git a/libc/test/src/math/smoke/FMinimumTest.h b/libc/test/src/math/smoke/FMinimumTest.h index 242c857fbb99bc..b5c0e98d17b99c 100644 --- a/libc/test/src/math/smoke/FMinimumTest.h +++ b/libc/test/src/math/smoke/FMinimumTest.h @@ -25,8 +25,8 @@ class FMinimumTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { void testNaN(FMinimumFunc func) { EXPECT_FP_EQ(aNaN, func(aNaN, inf)); EXPECT_FP_EQ(aNaN, func(neg_inf, aNaN)); - EXPECT_FP_EQ(aNaN, func(aNaN, 0.0)); - EXPECT_FP_EQ(aNaN, func(-0.0, aNaN)); + EXPECT_FP_EQ(aNaN, func(aNaN, zero)); + EXPECT_FP_EQ(aNaN, func(neg_zero, aNaN)); EXPECT_FP_EQ(aNaN, func(aNaN, T(-1.2345))); EXPECT_FP_EQ(aNaN, func(T(1.2345), aNaN)); EXPECT_FP_EQ(aNaN, func(aNaN, aNaN)); @@ -34,25 +34,25 @@ class FMinimumTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { void testInfArg(FMinimumFunc func) { EXPECT_FP_EQ(neg_inf, func(neg_inf, inf)); - EXPECT_FP_EQ(0.0, func(inf, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, inf)); + EXPECT_FP_EQ(zero, func(inf, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, inf)); EXPECT_FP_EQ(T(1.2345), func(inf, T(1.2345))); EXPECT_FP_EQ(T(1.2345), func(T(1.2345), inf)); } void testNegInfArg(FMinimumFunc func) { EXPECT_FP_EQ(neg_inf, func(inf, neg_inf)); - EXPECT_FP_EQ(neg_inf, func(neg_inf, 0.0)); - EXPECT_FP_EQ(neg_inf, func(-0.0, neg_inf)); + EXPECT_FP_EQ(neg_inf, func(neg_inf, zero)); + EXPECT_FP_EQ(neg_inf, func(neg_zero, neg_inf)); EXPECT_FP_EQ(neg_inf, func(neg_inf, T(-1.2345))); EXPECT_FP_EQ(neg_inf, func(T(1.2345), neg_inf)); } void testBothZero(FMinimumFunc func) { - EXPECT_FP_EQ(0.0, func(0.0, 0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, 0.0)); - EXPECT_FP_EQ(-0.0, func(0.0, -0.0)); - EXPECT_FP_EQ(-0.0, func(-0.0, -0.0)); + EXPECT_FP_EQ(zero, func(zero, zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, zero)); + EXPECT_FP_EQ(neg_zero, func(zero, neg_zero)); + EXPECT_FP_EQ(neg_zero, func(neg_zero, neg_zero)); } void testRange(FMinimumFunc func) { diff --git a/libc/test/src/math/smoke/FModTest.h b/libc/test/src/math/smoke/FModTest.h index 405e3107438d42..0a4227da83f81d 100644 --- a/libc/test/src/math/smoke/FModTest.h +++ b/libc/test/src/math/smoke/FModTest.h @@ -35,16 +35,16 @@ class FmodTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { void testSpecialNumbers(FModFunc f) { // fmod (+0, y) == +0 for y != 0. - TEST_SPECIAL(0.0, 3.0, 0.0, false, 0); - TEST_SPECIAL(0.0, min_denormal, 0.0, false, 0); - TEST_SPECIAL(0.0, -min_denormal, 0.0, false, 0); - TEST_SPECIAL(0.0, min_normal, 0.0, false, 0); - TEST_SPECIAL(0.0, -min_normal, 0.0, false, 0); - TEST_SPECIAL(0.0, max_normal, 0.0, false, 0); - TEST_SPECIAL(0.0, -max_normal, 0.0, false, 0); + TEST_SPECIAL(zero, T(3.0), zero, false, 0); + TEST_SPECIAL(zero, min_denormal, zero, false, 0); + TEST_SPECIAL(zero, -min_denormal, zero, false, 0); + TEST_SPECIAL(zero, min_normal, zero, false, 0); + TEST_SPECIAL(zero, -min_normal, zero, false, 0); + TEST_SPECIAL(zero, max_normal, zero, false, 0); + TEST_SPECIAL(zero, -max_normal, zero, false, 0); // fmod (-0, y) == -0 for y != 0. - TEST_SPECIAL(neg_zero, 3.0, neg_zero, false, 0); + TEST_SPECIAL(neg_zero, T(3.0), neg_zero, false, 0); TEST_SPECIAL(neg_zero, min_denormal, neg_zero, false, 0); TEST_SPECIAL(neg_zero, -min_denormal, neg_zero, false, 0); TEST_SPECIAL(neg_zero, min_normal, neg_zero, false, 0); @@ -53,9 +53,9 @@ class FmodTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { TEST_SPECIAL(neg_zero, -max_normal, neg_zero, false, 0); // fmod (+inf, y) == aNaN plus invalid exception. - TEST_SPECIAL(inf, 3.0, aNaN, true, FE_INVALID); - TEST_SPECIAL(inf, -1.1L, aNaN, true, FE_INVALID); - TEST_SPECIAL(inf, 0.0, aNaN, true, FE_INVALID); + TEST_SPECIAL(inf, T(3.0), aNaN, true, FE_INVALID); + TEST_SPECIAL(inf, T(-1.1), aNaN, true, FE_INVALID); + TEST_SPECIAL(inf, zero, aNaN, true, FE_INVALID); TEST_SPECIAL(inf, neg_zero, aNaN, true, FE_INVALID); TEST_SPECIAL(inf, min_denormal, aNaN, true, FE_INVALID); TEST_SPECIAL(inf, min_normal, aNaN, true, FE_INVALID); @@ -64,9 +64,9 @@ class FmodTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { TEST_SPECIAL(inf, neg_inf, aNaN, true, FE_INVALID); // fmod (-inf, y) == aNaN plus invalid exception. - TEST_SPECIAL(neg_inf, 3.0, aNaN, true, FE_INVALID); - TEST_SPECIAL(neg_inf, -1.1L, aNaN, true, FE_INVALID); - TEST_SPECIAL(neg_inf, 0.0, aNaN, true, FE_INVALID); + TEST_SPECIAL(neg_inf, T(3.0), aNaN, true, FE_INVALID); + TEST_SPECIAL(neg_inf, T(-1.1), aNaN, true, FE_INVALID); + TEST_SPECIAL(neg_inf, zero, aNaN, true, FE_INVALID); TEST_SPECIAL(neg_inf, neg_zero, aNaN, true, FE_INVALID); TEST_SPECIAL(neg_inf, min_denormal, aNaN, true, FE_INVALID); TEST_SPECIAL(neg_inf, min_normal, aNaN, true, FE_INVALID); @@ -75,74 +75,74 @@ class FmodTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { TEST_SPECIAL(neg_inf, neg_inf, aNaN, true, FE_INVALID); // fmod (x, +0) == aNaN plus invalid exception. - TEST_SPECIAL(3.0, 0.0, aNaN, true, FE_INVALID); - TEST_SPECIAL(-1.1L, 0.0, aNaN, true, FE_INVALID); - TEST_SPECIAL(0.0, 0.0, aNaN, true, FE_INVALID); - TEST_SPECIAL(neg_zero, 0.0, aNaN, true, FE_INVALID); - TEST_SPECIAL(min_denormal, 0.0, aNaN, true, FE_INVALID); - TEST_SPECIAL(min_normal, 0.0, aNaN, true, FE_INVALID); - TEST_SPECIAL(max_normal, 0.0, aNaN, true, FE_INVALID); + TEST_SPECIAL(T(3.0), zero, aNaN, true, FE_INVALID); + TEST_SPECIAL(T(-1.1), zero, aNaN, true, FE_INVALID); + TEST_SPECIAL(zero, zero, aNaN, true, FE_INVALID); + TEST_SPECIAL(neg_zero, zero, aNaN, true, FE_INVALID); + TEST_SPECIAL(min_denormal, zero, aNaN, true, FE_INVALID); + TEST_SPECIAL(min_normal, zero, aNaN, true, FE_INVALID); + TEST_SPECIAL(max_normal, zero, aNaN, true, FE_INVALID); // fmod (x, -0) == aNaN plus invalid exception. - TEST_SPECIAL(3.0, neg_zero, aNaN, true, FE_INVALID); - TEST_SPECIAL(-1.1L, neg_zero, aNaN, true, FE_INVALID); - TEST_SPECIAL(0.0, neg_zero, aNaN, true, FE_INVALID); + TEST_SPECIAL(T(3.0), neg_zero, aNaN, true, FE_INVALID); + TEST_SPECIAL(T(-1.1), neg_zero, aNaN, true, FE_INVALID); + TEST_SPECIAL(zero, neg_zero, aNaN, true, FE_INVALID); TEST_SPECIAL(neg_zero, neg_zero, aNaN, true, FE_INVALID); TEST_SPECIAL(min_denormal, neg_zero, aNaN, true, FE_INVALID); TEST_SPECIAL(min_normal, neg_zero, aNaN, true, FE_INVALID); TEST_SPECIAL(max_normal, neg_zero, aNaN, true, FE_INVALID); // fmod (x, +inf) == x for x not infinite. - TEST_SPECIAL(0.0, inf, 0.0, false, 0); + TEST_SPECIAL(zero, inf, zero, false, 0); TEST_SPECIAL(neg_zero, inf, neg_zero, false, 0); TEST_SPECIAL(min_denormal, inf, min_denormal, false, 0); TEST_SPECIAL(min_normal, inf, min_normal, false, 0); TEST_SPECIAL(max_normal, inf, max_normal, false, 0); - TEST_SPECIAL(3.0, inf, 3.0, false, 0); + TEST_SPECIAL(T(3.0), inf, T(3.0), false, 0); // fmod (x, -inf) == x for x not infinite. - TEST_SPECIAL(0.0, neg_inf, 0.0, false, 0); + TEST_SPECIAL(zero, neg_inf, zero, false, 0); TEST_SPECIAL(neg_zero, neg_inf, neg_zero, false, 0); TEST_SPECIAL(min_denormal, neg_inf, min_denormal, false, 0); TEST_SPECIAL(min_normal, neg_inf, min_normal, false, 0); TEST_SPECIAL(max_normal, neg_inf, max_normal, false, 0); - TEST_SPECIAL(3.0, neg_inf, 3.0, false, 0); + TEST_SPECIAL(T(3.0), neg_inf, T(3.0), false, 0); - TEST_SPECIAL(0.0, aNaN, aNaN, false, 0); - TEST_SPECIAL(0.0, -aNaN, aNaN, false, 0); + TEST_SPECIAL(zero, aNaN, aNaN, false, 0); + TEST_SPECIAL(zero, -aNaN, aNaN, false, 0); TEST_SPECIAL(neg_zero, aNaN, aNaN, false, 0); TEST_SPECIAL(neg_zero, -aNaN, aNaN, false, 0); - TEST_SPECIAL(1.0, aNaN, aNaN, false, 0); - TEST_SPECIAL(1.0, -aNaN, aNaN, false, 0); + TEST_SPECIAL(T(1.0), aNaN, aNaN, false, 0); + TEST_SPECIAL(T(1.0), -aNaN, aNaN, false, 0); TEST_SPECIAL(inf, aNaN, aNaN, false, 0); TEST_SPECIAL(inf, -aNaN, aNaN, false, 0); TEST_SPECIAL(neg_inf, aNaN, aNaN, false, 0); TEST_SPECIAL(neg_inf, -aNaN, aNaN, false, 0); - TEST_SPECIAL(0.0, sNaN, aNaN, false, FE_INVALID); - TEST_SPECIAL(0.0, -sNaN, aNaN, false, FE_INVALID); + TEST_SPECIAL(zero, sNaN, aNaN, false, FE_INVALID); + TEST_SPECIAL(zero, -sNaN, aNaN, false, FE_INVALID); TEST_SPECIAL(neg_zero, sNaN, aNaN, false, FE_INVALID); TEST_SPECIAL(neg_zero, -sNaN, aNaN, false, FE_INVALID); - TEST_SPECIAL(1.0, sNaN, aNaN, false, FE_INVALID); - TEST_SPECIAL(1.0, -sNaN, aNaN, false, FE_INVALID); + TEST_SPECIAL(T(1.0), sNaN, aNaN, false, FE_INVALID); + TEST_SPECIAL(T(1.0), -sNaN, aNaN, false, FE_INVALID); TEST_SPECIAL(inf, sNaN, aNaN, false, FE_INVALID); TEST_SPECIAL(inf, -sNaN, aNaN, false, FE_INVALID); TEST_SPECIAL(neg_inf, sNaN, aNaN, false, FE_INVALID); TEST_SPECIAL(neg_inf, -sNaN, aNaN, false, FE_INVALID); - TEST_SPECIAL(aNaN, 0.0, aNaN, false, 0); - TEST_SPECIAL(-aNaN, 0.0, aNaN, false, 0); + TEST_SPECIAL(aNaN, zero, aNaN, false, 0); + TEST_SPECIAL(-aNaN, zero, aNaN, false, 0); TEST_SPECIAL(aNaN, neg_zero, aNaN, false, 0); TEST_SPECIAL(-aNaN, neg_zero, aNaN, false, 0); - TEST_SPECIAL(aNaN, 1.0, aNaN, false, 0); - TEST_SPECIAL(-aNaN, 1.0, aNaN, false, 0); + TEST_SPECIAL(aNaN, T(1.0), aNaN, false, 0); + TEST_SPECIAL(-aNaN, T(1.0), aNaN, false, 0); TEST_SPECIAL(aNaN, inf, aNaN, false, 0); TEST_SPECIAL(-aNaN, inf, aNaN, false, 0); TEST_SPECIAL(aNaN, neg_inf, aNaN, false, 0); TEST_SPECIAL(-aNaN, neg_inf, aNaN, false, 0); - TEST_SPECIAL(sNaN, 0.0, aNaN, false, FE_INVALID); - TEST_SPECIAL(-sNaN, 0.0, aNaN, false, FE_INVALID); + TEST_SPECIAL(sNaN, zero, aNaN, false, FE_INVALID); + TEST_SPECIAL(-sNaN, zero, aNaN, false, FE_INVALID); TEST_SPECIAL(sNaN, neg_zero, aNaN, false, FE_INVALID); TEST_SPECIAL(-sNaN, neg_zero, aNaN, false, FE_INVALID); - TEST_SPECIAL(sNaN, 1.0, aNaN, false, FE_INVALID); - TEST_SPECIAL(-sNaN, 1.0, aNaN, false, FE_INVALID); + TEST_SPECIAL(sNaN, T(1.0), aNaN, false, FE_INVALID); + TEST_SPECIAL(-sNaN, T(1.0), aNaN, false, FE_INVALID); TEST_SPECIAL(sNaN, inf, aNaN, false, FE_INVALID); TEST_SPECIAL(-sNaN, inf, aNaN, false, FE_INVALID); TEST_SPECIAL(sNaN, neg_inf, aNaN, false, FE_INVALID); @@ -164,17 +164,17 @@ class FmodTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { TEST_SPECIAL(-sNaN, sNaN, aNaN, false, FE_INVALID); TEST_SPECIAL(-sNaN, -sNaN, aNaN, false, FE_INVALID); - TEST_SPECIAL(6.5, 2.25L, 2.0L, false, 0); - TEST_SPECIAL(-6.5, 2.25L, -2.0L, false, 0); - TEST_SPECIAL(6.5, -2.25L, 2.0L, false, 0); - TEST_SPECIAL(-6.5, -2.25L, -2.0L, false, 0); - - TEST_SPECIAL(max_normal, max_normal, 0.0, false, 0); - TEST_SPECIAL(max_normal, -max_normal, 0.0, false, 0); - TEST_SPECIAL(max_normal, min_normal, 0.0, false, 0); - TEST_SPECIAL(max_normal, -min_normal, 0.0, false, 0); - TEST_SPECIAL(max_normal, min_denormal, 0.0, false, 0); - TEST_SPECIAL(max_normal, -min_denormal, 0.0, false, 0); + TEST_SPECIAL(T(6.5), T(2.25), T(2.0), false, 0); + TEST_SPECIAL(T(-6.5), T(2.25), T(-2.0), false, 0); + TEST_SPECIAL(T(6.5), T(-2.25), T(2.0), false, 0); + TEST_SPECIAL(T(-6.5), T(-2.25), T(-2.0), false, 0); + + TEST_SPECIAL(max_normal, max_normal, zero, false, 0); + TEST_SPECIAL(max_normal, -max_normal, zero, false, 0); + TEST_SPECIAL(max_normal, min_normal, zero, false, 0); + TEST_SPECIAL(max_normal, -min_normal, zero, false, 0); + TEST_SPECIAL(max_normal, min_denormal, zero, false, 0); + TEST_SPECIAL(max_normal, -min_denormal, zero, false, 0); TEST_SPECIAL(-max_normal, max_normal, neg_zero, false, 0); TEST_SPECIAL(-max_normal, -max_normal, neg_zero, false, 0); TEST_SPECIAL(-max_normal, min_normal, neg_zero, false, 0); @@ -184,10 +184,10 @@ class FmodTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { TEST_SPECIAL(min_normal, max_normal, min_normal, false, 0); TEST_SPECIAL(min_normal, -max_normal, min_normal, false, 0); - TEST_SPECIAL(min_normal, min_normal, 0.0, false, 0); - TEST_SPECIAL(min_normal, -min_normal, 0.0, false, 0); - TEST_SPECIAL(min_normal, min_denormal, 0.0, false, 0); - TEST_SPECIAL(min_normal, -min_denormal, 0.0, false, 0); + TEST_SPECIAL(min_normal, min_normal, zero, false, 0); + TEST_SPECIAL(min_normal, -min_normal, zero, false, 0); + TEST_SPECIAL(min_normal, min_denormal, zero, false, 0); + TEST_SPECIAL(min_normal, -min_denormal, zero, false, 0); TEST_SPECIAL(-min_normal, max_normal, -min_normal, false, 0); TEST_SPECIAL(-min_normal, -max_normal, -min_normal, false, 0); TEST_SPECIAL(-min_normal, min_normal, neg_zero, false, 0); @@ -199,8 +199,8 @@ class FmodTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { TEST_SPECIAL(min_denormal, -max_normal, min_denormal, false, 0); TEST_SPECIAL(min_denormal, min_normal, min_denormal, false, 0); TEST_SPECIAL(min_denormal, -min_normal, min_denormal, false, 0); - TEST_SPECIAL(min_denormal, min_denormal, 0.0, false, 0); - TEST_SPECIAL(min_denormal, -min_denormal, 0.0, false, 0); + TEST_SPECIAL(min_denormal, min_denormal, zero, false, 0); + TEST_SPECIAL(min_denormal, -min_denormal, zero, false, 0); TEST_SPECIAL(-min_denormal, max_normal, -min_denormal, false, 0); TEST_SPECIAL(-min_denormal, -max_normal, -min_denormal, false, 0); TEST_SPECIAL(-min_denormal, min_normal, -min_denormal, false, 0); @@ -212,33 +212,33 @@ class FmodTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { void testRegularExtreme(FModFunc f) { if constexpr (sizeof(T) < sizeof(float)) return; - TEST_REGULAR(0x1p127L, 0x3p-149L, 0x1p-149L); - TEST_REGULAR(0x1p127L, -0x3p-149L, 0x1p-149L); - TEST_REGULAR(0x1p127L, 0x3p-148L, 0x1p-147L); - TEST_REGULAR(0x1p127L, -0x3p-148L, 0x1p-147L); - TEST_REGULAR(0x1p127L, 0x3p-126L, 0x1p-125L); - TEST_REGULAR(0x1p127L, -0x3p-126L, 0x1p-125L); - TEST_REGULAR(-0x1p127L, 0x3p-149L, -0x1p-149L); - TEST_REGULAR(-0x1p127L, -0x3p-149L, -0x1p-149L); - TEST_REGULAR(-0x1p127L, 0x3p-148L, -0x1p-147L); - TEST_REGULAR(-0x1p127L, -0x3p-148L, -0x1p-147L); - TEST_REGULAR(-0x1p127L, 0x3p-126L, -0x1p-125L); - TEST_REGULAR(-0x1p127L, -0x3p-126L, -0x1p-125L); + TEST_REGULAR(T(0x1p127), T(0x3p-149), T(0x1p-149)); + TEST_REGULAR(T(0x1p127), T(-0x3p-149), T(0x1p-149)); + TEST_REGULAR(T(0x1p127), T(0x3p-148), T(0x1p-147)); + TEST_REGULAR(T(0x1p127), T(-0x3p-148), T(0x1p-147)); + TEST_REGULAR(T(0x1p127), T(0x3p-126), T(0x1p-125)); + TEST_REGULAR(T(0x1p127), T(-0x3p-126), T(0x1p-125)); + TEST_REGULAR(T(-0x1p127), T(0x3p-149), T(-0x1p-149)); + TEST_REGULAR(T(-0x1p127), T(-0x3p-149), T(-0x1p-149)); + TEST_REGULAR(T(-0x1p127), T(0x3p-148), T(-0x1p-147)); + TEST_REGULAR(T(-0x1p127), T(-0x3p-148), T(-0x1p-147)); + TEST_REGULAR(T(-0x1p127), T(0x3p-126), T(-0x1p-125)); + TEST_REGULAR(T(-0x1p127), T(-0x3p-126), T(-0x1p-125)); if constexpr (sizeof(T) < sizeof(double)) return; - TEST_REGULAR(0x1p1023L, 0x3p-1074L, 0x1p-1073L); - TEST_REGULAR(0x1p1023L, -0x3p-1074L, 0x1p-1073L); - TEST_REGULAR(0x1p1023L, 0x3p-1073L, 0x1p-1073L); - TEST_REGULAR(0x1p1023L, -0x3p-1073L, 0x1p-1073L); - TEST_REGULAR(0x1p1023L, 0x3p-1022L, 0x1p-1021L); - TEST_REGULAR(0x1p1023L, -0x3p-1022L, 0x1p-1021L); - TEST_REGULAR(-0x1p1023L, 0x3p-1074L, -0x1p-1073L); - TEST_REGULAR(-0x1p1023L, -0x3p-1074L, -0x1p-1073L); - TEST_REGULAR(-0x1p1023L, 0x3p-1073L, -0x1p-1073L); - TEST_REGULAR(-0x1p1023L, -0x3p-1073L, -0x1p-1073L); - TEST_REGULAR(-0x1p1023L, 0x3p-1022L, -0x1p-1021L); - TEST_REGULAR(-0x1p1023L, -0x3p-1022L, -0x1p-1021L); + TEST_REGULAR(T(0x1p1023), T(0x3p-1074), T(0x1p-1073)); + TEST_REGULAR(T(0x1p1023), T(-0x3p-1074), T(0x1p-1073)); + TEST_REGULAR(T(0x1p1023), T(0x3p-1073), T(0x1p-1073)); + TEST_REGULAR(T(0x1p1023), T(-0x3p-1073), T(0x1p-1073)); + TEST_REGULAR(T(0x1p1023), T(0x3p-1022), T(0x1p-1021)); + TEST_REGULAR(T(0x1p1023), T(-0x3p-1022), T(0x1p-1021)); + TEST_REGULAR(T(-0x1p1023), T(0x3p-1074), T(-0x1p-1073)); + TEST_REGULAR(T(-0x1p1023), T(-0x3p-1074), T(-0x1p-1073)); + TEST_REGULAR(T(-0x1p1023), T(0x3p-1073), T(-0x1p-1073)); + TEST_REGULAR(T(-0x1p1023), T(-0x3p-1073), T(-0x1p-1073)); + TEST_REGULAR(T(-0x1p1023), T(0x3p-1022), T(-0x1p-1021)); + TEST_REGULAR(T(-0x1p1023), T(-0x3p-1022), T(-0x1p-1021)); } }; diff --git a/libc/test/src/math/smoke/FrexpTest.h b/libc/test/src/math/smoke/FrexpTest.h index fc2313a94ef09e..11641fc6743c44 100644 --- a/libc/test/src/math/smoke/FrexpTest.h +++ b/libc/test/src/math/smoke/FrexpTest.h @@ -24,10 +24,10 @@ class FrexpTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { EXPECT_FP_EQ_ALL_ROUNDING(inf, func(inf, &exponent)); EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, func(neg_inf, &exponent)); - EXPECT_FP_EQ_ALL_ROUNDING(0.0, func(0.0, &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(zero, func(zero, &exponent)); EXPECT_EQ(exponent, 0); - EXPECT_FP_EQ_ALL_ROUNDING(-0.0, func(-0.0, &exponent)); + EXPECT_FP_EQ_ALL_ROUNDING(-zero, func(-zero, &exponent)); EXPECT_EQ(exponent, 0); } diff --git a/libc/test/src/math/smoke/ILogbTest.h b/libc/test/src/math/smoke/ILogbTest.h index 3315ac2cbbc644..988f71f54bf0df 100644 --- a/libc/test/src/math/smoke/ILogbTest.h +++ b/libc/test/src/math/smoke/ILogbTest.h @@ -47,13 +47,13 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { EXPECT_EQ(OutType(2), func(InType(-4.0))); EXPECT_EQ(OutType(3), func(InType(8.0))); - EXPECT_EQ(OutType(3), func(-8.0)); + EXPECT_EQ(OutType(3), func(InType(-8.0))); - EXPECT_EQ(OutType(4), func(16.0)); - EXPECT_EQ(OutType(4), func(-16.0)); + EXPECT_EQ(OutType(4), func(InType(16.0))); + EXPECT_EQ(OutType(4), func(InType(-16.0))); - EXPECT_EQ(OutType(5), func(32.0)); - EXPECT_EQ(OutType(5), func(-32.0)); + EXPECT_EQ(OutType(5), func(InType(32.0))); + EXPECT_EQ(OutType(5), func(InType(-32.0))); } void test_some_integers(Func func) { @@ -67,10 +67,10 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { EXPECT_EQ(OutType(3), func(InType(-10.0))); EXPECT_EQ(OutType(4), func(InType(31.0))); - EXPECT_EQ(OutType(4), func(-31.0)); + EXPECT_EQ(OutType(4), func(InType(-31.0))); - EXPECT_EQ(OutType(5), func(55.0)); - EXPECT_EQ(OutType(5), func(-55.0)); + EXPECT_EQ(OutType(5), func(InType(55.0))); + EXPECT_EQ(OutType(5), func(InType(-55.0))); } void test_subnormal_range(Func func) { diff --git a/libc/test/src/math/smoke/LogbTest.h b/libc/test/src/math/smoke/LogbTest.h index 0bb6e12665b934..a9f34e7517b831 100644 --- a/libc/test/src/math/smoke/LogbTest.h +++ b/libc/test/src/math/smoke/LogbTest.h @@ -27,8 +27,8 @@ class LogbTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { ASSERT_FP_EQ(aNaN, func(aNaN)); ASSERT_FP_EQ(inf, func(inf)); ASSERT_FP_EQ(inf, func(neg_inf)); - ASSERT_FP_EQ(neg_inf, func(0.0)); - ASSERT_FP_EQ(neg_inf, func(-0.0)); + ASSERT_FP_EQ(neg_inf, func(zero)); + ASSERT_FP_EQ(neg_inf, func(neg_zero)); } void testPowersOfTwo(LogbFunc func) { diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp index be7c06fa918fd3..45d35ad3930da3 100644 --- a/libc/test/src/stdio/sprintf_test.cpp +++ b/libc/test/src/stdio/sprintf_test.cpp @@ -630,7 +630,7 @@ TEST(LlvmLibcSPrintfTest, PointerConv) { ASSERT_STREQ(buff, "0x1a2b3c4d5e6f7081"); } - written = LIBC_NAMESPACE::sprintf(buff, "%p", buff); + written = LIBC_NAMESPACE::sprintf(buff, "%p", &written); EXPECT_GT(written, 0); // Width tests: @@ -1687,9 +1687,6 @@ TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) { TEST_F(LlvmLibcSPrintfTest, FloatDecimalLongDoubleConv) { ForceRoundingMode r(RoundingMode::Nearest); - char big_buff[10000]; // Used for long doubles and other extremely wide - // numbers. - // Length Modifier Tests. // TODO(michaelrj): Add tests for LIBC_TYPES_LONG_DOUBLE_IS_FLOAT64 and 128 @@ -1741,6 +1738,8 @@ TEST_F(LlvmLibcSPrintfTest, FloatDecimalLongDoubleConv) { "000000000000000000000000000000000000000000000000000000000000000000000000" "00000000000000000000000000000000000000000000000000000000000000000000"); + char big_buff[10000]; // Used for extremely wide numbers. + written = LIBC_NAMESPACE::sprintf(big_buff, "%Lf", 1e1000L); ASSERT_STREQ_LEN( written, big_buff, diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt index 38488778c657c3..db90d9a4741ebf 100644 --- a/libc/test/src/stdlib/CMakeLists.txt +++ b/libc/test/src/stdlib/CMakeLists.txt @@ -290,14 +290,39 @@ add_libc_test( libc.src.stdlib.bsearch ) +add_libc_test( + quick_sort_test + SUITE + libc-stdlib-tests + SRCS + quick_sort_test.cpp + HDRS + SortingTest.h + DEPENDS + libc.src.stdlib.qsort_util +) + +add_libc_test( + heap_sort_test + SUITE + libc-stdlib-tests + SRCS + heap_sort_test.cpp + HDRS + SortingTest.h + DEPENDS + libc.src.stdlib.qsort_util +) + add_libc_test( qsort_test SUITE libc-stdlib-tests SRCS qsort_test.cpp + HDRS + SortingTest.h DEPENDS - libc.include.stdlib libc.src.stdlib.qsort ) diff --git a/libc/test/src/stdlib/SortingTest.h b/libc/test/src/stdlib/SortingTest.h new file mode 100644 index 00000000000000..d34584e5addf03 --- /dev/null +++ b/libc/test/src/stdlib/SortingTest.h @@ -0,0 +1,377 @@ +//===-- Unittests for sorting routines ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/macros/config.h" +#include "src/stdlib/qsort_data.h" +#include "test/UnitTest/Test.h" + +class SortingTest : public LIBC_NAMESPACE::testing::Test { + + using Array = LIBC_NAMESPACE::internal::Array; + using Comparator = LIBC_NAMESPACE::internal::Comparator; + using SortingRoutine = LIBC_NAMESPACE::internal::SortingRoutine; + +public: + static int int_compare(const void *l, const void *r) { + int li = *reinterpret_cast(l); + int ri = *reinterpret_cast(r); + if (li == ri) + return 0; + else if (li > ri) + return 1; + else + return -1; + } + + void test_sorted_array(SortingRoutine sort_func) { + int array[25] = {10, 23, 33, 35, 55, 70, 71, 100, 110, + 123, 133, 135, 155, 170, 171, 1100, 1110, 1123, + 1133, 1135, 1155, 1170, 1171, 11100, 12310}; + constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + + auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, + sizeof(int), Comparator(int_compare)); + + sort_func(arr); + + ASSERT_LE(array[0], 10); + ASSERT_LE(array[1], 23); + ASSERT_LE(array[2], 33); + ASSERT_LE(array[3], 35); + ASSERT_LE(array[4], 55); + ASSERT_LE(array[5], 70); + ASSERT_LE(array[6], 71); + ASSERT_LE(array[7], 100); + ASSERT_LE(array[8], 110); + ASSERT_LE(array[9], 123); + ASSERT_LE(array[10], 133); + ASSERT_LE(array[11], 135); + ASSERT_LE(array[12], 155); + ASSERT_LE(array[13], 170); + ASSERT_LE(array[14], 171); + ASSERT_LE(array[15], 1100); + ASSERT_LE(array[16], 1110); + ASSERT_LE(array[17], 1123); + ASSERT_LE(array[18], 1133); + ASSERT_LE(array[19], 1135); + ASSERT_LE(array[20], 1155); + ASSERT_LE(array[21], 1170); + ASSERT_LE(array[22], 1171); + ASSERT_LE(array[23], 11100); + ASSERT_LE(array[24], 12310); + } + + void test_reversed_sorted_array(SortingRoutine sort_func) { + int array[] = {25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, + 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}; + constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + + auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, + sizeof(int), Comparator(int_compare)); + + sort_func(arr); + + for (int i = 0; i < int(ARRAY_SIZE - 1); ++i) + ASSERT_EQ(array[i], i + 1); + } + + void test_all_equal_elements(SortingRoutine sort_func) { + int array[] = {100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100}; + constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + + auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, + sizeof(int), Comparator(int_compare)); + + sort_func(arr); + + for (size_t i = 0; i < ARRAY_SIZE; ++i) + ASSERT_EQ(array[i], 100); + } + + void test_unsorted_array_1(SortingRoutine sort_func) { + int array[25] = {10, 23, 8, 35, 55, 45, 40, 100, 110, + 123, 90, 80, 70, 60, 171, 11, 1, -1, + -5, -10, 1155, 1170, 1171, 12, -100}; + constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + + auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, + sizeof(int), Comparator(int_compare)); + + sort_func(arr); + + ASSERT_EQ(array[0], -100); + ASSERT_EQ(array[1], -10); + ASSERT_EQ(array[2], -5); + ASSERT_EQ(array[3], -1); + ASSERT_EQ(array[4], 1); + ASSERT_EQ(array[5], 8); + ASSERT_EQ(array[6], 10); + ASSERT_EQ(array[7], 11); + ASSERT_EQ(array[8], 12); + ASSERT_EQ(array[9], 23); + ASSERT_EQ(array[10], 35); + ASSERT_EQ(array[11], 40); + ASSERT_EQ(array[12], 45); + ASSERT_EQ(array[13], 55); + ASSERT_EQ(array[14], 60); + ASSERT_EQ(array[15], 70); + ASSERT_EQ(array[16], 80); + ASSERT_EQ(array[17], 90); + ASSERT_EQ(array[18], 100); + ASSERT_EQ(array[19], 110); + ASSERT_EQ(array[20], 123); + ASSERT_EQ(array[21], 171); + ASSERT_EQ(array[22], 1155); + ASSERT_EQ(array[23], 1170); + ASSERT_EQ(array[24], 1171); + } + + void test_unsorted_array_2(SortingRoutine sort_func) { + int array[7] = {10, 40, 45, 55, 35, 23, 60}; + constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + + auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, + sizeof(int), Comparator(int_compare)); + + sort_func(arr); + + ASSERT_EQ(array[0], 10); + ASSERT_EQ(array[1], 23); + ASSERT_EQ(array[2], 35); + ASSERT_EQ(array[3], 40); + ASSERT_EQ(array[4], 45); + ASSERT_EQ(array[5], 55); + ASSERT_EQ(array[6], 60); + } + + void test_unsorted_array_duplicated_1(SortingRoutine sort_func) { + int array[6] = {10, 10, 20, 20, 5, 5}; + constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + + auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, + sizeof(int), Comparator(int_compare)); + + sort_func(arr); + + ASSERT_EQ(array[0], 5); + ASSERT_EQ(array[1], 5); + ASSERT_EQ(array[2], 10); + ASSERT_EQ(array[3], 10); + ASSERT_EQ(array[4], 20); + ASSERT_EQ(array[5], 20); + } + + void test_unsorted_array_duplicated_2(SortingRoutine sort_func) { + int array[10] = {20, 10, 10, 10, 10, 20, 21, 21, 21, 21}; + constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + + auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, + sizeof(int), Comparator(int_compare)); + + sort_func(arr); + + ASSERT_EQ(array[0], 10); + ASSERT_EQ(array[1], 10); + ASSERT_EQ(array[2], 10); + ASSERT_EQ(array[3], 10); + ASSERT_EQ(array[4], 20); + ASSERT_EQ(array[5], 20); + ASSERT_EQ(array[6], 21); + ASSERT_EQ(array[7], 21); + ASSERT_EQ(array[8], 21); + ASSERT_EQ(array[9], 21); + } + + void test_unsorted_array_duplicated_3(SortingRoutine sort_func) { + int array[10] = {20, 30, 30, 30, 30, 20, 21, 21, 21, 21}; + constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + + auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, + sizeof(int), Comparator(int_compare)); + + sort_func(arr); + + ASSERT_EQ(array[0], 20); + ASSERT_EQ(array[1], 20); + ASSERT_EQ(array[2], 21); + ASSERT_EQ(array[3], 21); + ASSERT_EQ(array[4], 21); + ASSERT_EQ(array[5], 21); + ASSERT_EQ(array[6], 30); + ASSERT_EQ(array[7], 30); + ASSERT_EQ(array[8], 30); + ASSERT_EQ(array[9], 30); + } + + void test_unsorted_three_element_1(SortingRoutine sort_func) { + int array[3] = {14999024, 0, 3}; + + constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + + auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, + sizeof(int), Comparator(int_compare)); + + sort_func(arr); + + ASSERT_EQ(array[0], 0); + ASSERT_EQ(array[1], 3); + ASSERT_EQ(array[2], 14999024); + } + + void test_unsorted_three_element_2(SortingRoutine sort_func) { + int array[3] = {3, 14999024, 0}; + + constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + + auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, + sizeof(int), Comparator(int_compare)); + + sort_func(arr); + + ASSERT_EQ(array[0], 0); + ASSERT_EQ(array[1], 3); + ASSERT_EQ(array[2], 14999024); + } + + void test_unsorted_three_element_3(SortingRoutine sort_func) { + int array[3] = {3, 0, 14999024}; + + constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + + auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, + sizeof(int), Comparator(int_compare)); + + sort_func(arr); + + ASSERT_EQ(array[0], 0); + ASSERT_EQ(array[1], 3); + ASSERT_EQ(array[2], 14999024); + } + + void test_same_three_element(SortingRoutine sort_func) { + int array[3] = {12345, 12345, 12345}; + + constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + + auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, + sizeof(int), Comparator(int_compare)); + + sort_func(arr); + + ASSERT_EQ(array[0], 12345); + ASSERT_EQ(array[1], 12345); + ASSERT_EQ(array[2], 12345); + } + + void test_unsorted_two_element_1(SortingRoutine sort_func) { + int array[] = {14999024, 0}; + + constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + + auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, + sizeof(int), Comparator(int_compare)); + + sort_func(arr); + + ASSERT_EQ(array[0], 0); + ASSERT_EQ(array[1], 14999024); + } + + void test_unsorted_two_element_2(SortingRoutine sort_func) { + int array[] = {0, 14999024}; + + constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + + auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, + sizeof(int), Comparator(int_compare)); + + sort_func(arr); + + ASSERT_EQ(array[0], 0); + ASSERT_EQ(array[1], 14999024); + } + + void test_same_two_element(SortingRoutine sort_func) { + int array[] = {12345, 12345}; + + constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + + auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, + sizeof(int), Comparator(int_compare)); + + sort_func(arr); + + ASSERT_EQ(array[0], 12345); + ASSERT_EQ(array[1], 12345); + } + + void test_single_element(SortingRoutine sort_func) { + int array[] = {12345}; + + constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); + + auto arr = Array(reinterpret_cast(array), ARRAY_SIZE, + sizeof(int), Comparator(int_compare)); + + sort_func(arr); + + ASSERT_EQ(array[0], 12345); + } +}; + +#define LIST_SORTING_TESTS(Name, Func) \ + using LlvmLibc##Name##Test = SortingTest; \ + TEST_F(LlvmLibc##Name##Test, SortedArray) { test_sorted_array(Func); } \ + TEST_F(LlvmLibc##Name##Test, ReverseSortedArray) { \ + test_reversed_sorted_array(Func); \ + } \ + TEST_F(LlvmLibc##Name##Test, AllEqualElements) { \ + test_all_equal_elements(Func); \ + } \ + TEST_F(LlvmLibc##Name##Test, UnsortedArray1) { \ + test_unsorted_array_1(Func); \ + } \ + TEST_F(LlvmLibc##Name##Test, UnsortedArray2) { \ + test_unsorted_array_2(Func); \ + } \ + TEST_F(LlvmLibc##Name##Test, UnsortedArrayDuplicateElements1) { \ + test_unsorted_array_duplicated_1(Func); \ + } \ + TEST_F(LlvmLibc##Name##Test, UnsortedArrayDuplicateElements2) { \ + test_unsorted_array_duplicated_2(Func); \ + } \ + TEST_F(LlvmLibc##Name##Test, UnsortedArrayDuplicateElements3) { \ + test_unsorted_array_duplicated_3(Func); \ + } \ + TEST_F(LlvmLibc##Name##Test, UnsortedThreeElementArray1) { \ + test_unsorted_three_element_1(Func); \ + } \ + TEST_F(LlvmLibc##Name##Test, UnsortedThreeElementArray2) { \ + test_unsorted_three_element_2(Func); \ + } \ + TEST_F(LlvmLibc##Name##Test, UnsortedThreeElementArray3) { \ + test_unsorted_three_element_3(Func); \ + } \ + TEST_F(LlvmLibc##Name##Test, SameElementThreeElementArray) { \ + test_same_three_element(Func); \ + } \ + TEST_F(LlvmLibc##Name##Test, UnsortedTwoElementArray1) { \ + test_unsorted_two_element_1(Func); \ + } \ + TEST_F(LlvmLibc##Name##Test, UnsortedTwoElementArray2) { \ + test_unsorted_two_element_2(Func); \ + } \ + TEST_F(LlvmLibc##Name##Test, SameElementTwoElementArray) { \ + test_same_two_element(Func); \ + } \ + TEST_F(LlvmLibc##Name##Test, SingleElementArray) { \ + test_single_element(Func); \ + } \ + static_assert(true) diff --git a/libc/test/src/stdlib/heap_sort_test.cpp b/libc/test/src/stdlib/heap_sort_test.cpp new file mode 100644 index 00000000000000..d70e3dc2272beb --- /dev/null +++ b/libc/test/src/stdlib/heap_sort_test.cpp @@ -0,0 +1,16 @@ +//===-- Unittests for heap sort -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SortingTest.h" +#include "src/stdlib/heap_sort.h" + +void sort(const LIBC_NAMESPACE::internal::Array &array) { + LIBC_NAMESPACE::internal::heap_sort(array); +} + +LIST_SORTING_TESTS(HeapSort, sort); diff --git a/libc/test/src/stdlib/qsort_test.cpp b/libc/test/src/stdlib/qsort_test.cpp index 0822d490e65208..1e921a86fd1fd3 100644 --- a/libc/test/src/stdlib/qsort_test.cpp +++ b/libc/test/src/stdlib/qsort_test.cpp @@ -6,260 +6,12 @@ // //===----------------------------------------------------------------------===// +#include "SortingTest.h" #include "src/stdlib/qsort.h" -#include "test/UnitTest/Test.h" - -#include - -static int int_compare(const void *l, const void *r) { - int li = *reinterpret_cast(l); - int ri = *reinterpret_cast(r); - if (li == ri) - return 0; - else if (li > ri) - return 1; - else - return -1; +void sort(const LIBC_NAMESPACE::internal::Array &array) { + LIBC_NAMESPACE::qsort(reinterpret_cast(array.get(0)), array.size(), + sizeof(int), SortingTest::int_compare); } -TEST(LlvmLibcQsortTest, SortedArray) { - int array[25] = {10, 23, 33, 35, 55, 70, 71, 100, 110, - 123, 133, 135, 155, 170, 171, 1100, 1110, 1123, - 1133, 1135, 1155, 1170, 1171, 11100, 12310}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - LIBC_NAMESPACE::qsort(array, ARRAY_SIZE, sizeof(int), int_compare); - - ASSERT_LE(array[0], 10); - ASSERT_LE(array[1], 23); - ASSERT_LE(array[2], 33); - ASSERT_LE(array[3], 35); - ASSERT_LE(array[4], 55); - ASSERT_LE(array[5], 70); - ASSERT_LE(array[6], 71); - ASSERT_LE(array[7], 100); - ASSERT_LE(array[8], 110); - ASSERT_LE(array[9], 123); - ASSERT_LE(array[10], 133); - ASSERT_LE(array[11], 135); - ASSERT_LE(array[12], 155); - ASSERT_LE(array[13], 170); - ASSERT_LE(array[14], 171); - ASSERT_LE(array[15], 1100); - ASSERT_LE(array[16], 1110); - ASSERT_LE(array[17], 1123); - ASSERT_LE(array[18], 1133); - ASSERT_LE(array[19], 1135); - ASSERT_LE(array[20], 1155); - ASSERT_LE(array[21], 1170); - ASSERT_LE(array[22], 1171); - ASSERT_LE(array[23], 11100); - ASSERT_LE(array[24], 12310); -} - -TEST(LlvmLibcQsortTest, ReverseSortedArray) { - int array[25] = {25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, - 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - LIBC_NAMESPACE::qsort(array, ARRAY_SIZE, sizeof(int), int_compare); - - for (int i = 0; i < int(ARRAY_SIZE - 1); ++i) - ASSERT_LE(array[i], i + 1); -} - -TEST(LlvmLibcQsortTest, AllEqualElements) { - int array[25] = {100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - LIBC_NAMESPACE::qsort(array, ARRAY_SIZE, sizeof(int), int_compare); - - for (size_t i = 0; i < ARRAY_SIZE - 1; ++i) - ASSERT_LE(array[i], 100); -} - -TEST(LlvmLibcQsortTest, UnsortedArray1) { - int array[25] = {10, 23, 8, 35, 55, 45, 40, 100, 110, 123, 90, 80, 70, - 60, 171, 11, 1, -1, -5, -10, 1155, 1170, 1171, 12, -100}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - LIBC_NAMESPACE::qsort(array, ARRAY_SIZE, sizeof(int), int_compare); - - ASSERT_LE(array[0], -100); - ASSERT_LE(array[1], -10); - ASSERT_LE(array[2], -5); - ASSERT_LE(array[3], -1); - ASSERT_LE(array[4], 1); - ASSERT_LE(array[5], 8); - ASSERT_LE(array[6], 10); - ASSERT_LE(array[7], 11); - ASSERT_LE(array[8], 12); - ASSERT_LE(array[9], 23); - ASSERT_LE(array[10], 35); - ASSERT_LE(array[11], 40); - ASSERT_LE(array[12], 45); - ASSERT_LE(array[13], 55); - ASSERT_LE(array[14], 60); - ASSERT_LE(array[15], 70); - ASSERT_LE(array[16], 80); - ASSERT_LE(array[17], 90); - ASSERT_LE(array[18], 100); - ASSERT_LE(array[19], 110); - ASSERT_LE(array[20], 123); - ASSERT_LE(array[21], 171); - ASSERT_LE(array[22], 1155); - ASSERT_LE(array[23], 1170); - ASSERT_LE(array[24], 1171); -} - -TEST(LlvmLibcQsortTest, UnsortedArray2) { - int array[7] = {10, 40, 45, 55, 35, 23, 60}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - LIBC_NAMESPACE::qsort(array, ARRAY_SIZE, sizeof(int), int_compare); - - ASSERT_LE(array[0], 10); - ASSERT_LE(array[1], 23); - ASSERT_LE(array[2], 35); - ASSERT_LE(array[3], 40); - ASSERT_LE(array[4], 45); - ASSERT_LE(array[5], 55); - ASSERT_LE(array[6], 60); -} - -TEST(LlvmLibcQsortTest, UnsortedArrayDuplicateElements1) { - int array[6] = {10, 10, 20, 20, 5, 5}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - LIBC_NAMESPACE::qsort(array, ARRAY_SIZE, sizeof(int), int_compare); - - ASSERT_LE(array[0], 5); - ASSERT_LE(array[1], 5); - ASSERT_LE(array[2], 10); - ASSERT_LE(array[3], 10); - ASSERT_LE(array[4], 20); - ASSERT_LE(array[5], 20); -} - -TEST(LlvmLibcQsortTest, UnsortedArrayDuplicateElements2) { - int array[10] = {20, 10, 10, 10, 10, 20, 21, 21, 21, 21}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - LIBC_NAMESPACE::qsort(array, ARRAY_SIZE, sizeof(int), int_compare); - - ASSERT_LE(array[0], 10); - ASSERT_LE(array[1], 10); - ASSERT_LE(array[2], 10); - ASSERT_LE(array[3], 10); - ASSERT_LE(array[4], 20); - ASSERT_LE(array[5], 20); - ASSERT_LE(array[6], 21); - ASSERT_LE(array[7], 21); - ASSERT_LE(array[8], 21); - ASSERT_LE(array[9], 21); -} - -TEST(LlvmLibcQsortTest, UnsortedArrayDuplicateElements3) { - int array[10] = {20, 30, 30, 30, 30, 20, 21, 21, 21, 21}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - LIBC_NAMESPACE::qsort(array, ARRAY_SIZE, sizeof(int), int_compare); - - ASSERT_LE(array[0], 20); - ASSERT_LE(array[1], 20); - ASSERT_LE(array[2], 21); - ASSERT_LE(array[3], 21); - ASSERT_LE(array[4], 21); - ASSERT_LE(array[5], 21); - ASSERT_LE(array[6], 30); - ASSERT_LE(array[7], 30); - ASSERT_LE(array[8], 30); - ASSERT_LE(array[9], 30); -} - -TEST(LlvmLibcQsortTest, UnsortedThreeElementArray1) { - int array[3] = {14999024, 0, 3}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - LIBC_NAMESPACE::qsort(array, ARRAY_SIZE, sizeof(int), int_compare); - - ASSERT_LE(array[0], 0); - ASSERT_LE(array[1], 3); - ASSERT_LE(array[2], 14999024); -} - -TEST(LlvmLibcQsortTest, UnsortedThreeElementArray2) { - int array[3] = {3, 14999024, 0}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - LIBC_NAMESPACE::qsort(array, ARRAY_SIZE, sizeof(int), int_compare); - - ASSERT_LE(array[0], 0); - ASSERT_LE(array[1], 3); - ASSERT_LE(array[2], 14999024); -} - -TEST(LlvmLibcQsortTest, UnsortedThreeElementArray3) { - int array[3] = {3, 0, 14999024}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - LIBC_NAMESPACE::qsort(array, ARRAY_SIZE, sizeof(int), int_compare); - - ASSERT_LE(array[0], 0); - ASSERT_LE(array[1], 3); - ASSERT_LE(array[2], 14999024); -} - -TEST(LlvmLibcQsortTest, SameElementThreeElementArray) { - int array[3] = {12345, 12345, 12345}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - LIBC_NAMESPACE::qsort(array, ARRAY_SIZE, sizeof(int), int_compare); - - ASSERT_LE(array[0], 12345); - ASSERT_LE(array[1], 12345); - ASSERT_LE(array[2], 12345); -} - -TEST(LlvmLibcQsortTest, UnsortedTwoElementArray1) { - int array[2] = {14999024, 0}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - LIBC_NAMESPACE::qsort(array, ARRAY_SIZE, sizeof(int), int_compare); - - ASSERT_LE(array[0], 0); - ASSERT_LE(array[1], 14999024); -} - -TEST(LlvmLibcQsortTest, UnsortedTwoElementArray2) { - int array[2] = {0, 14999024}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - LIBC_NAMESPACE::qsort(array, ARRAY_SIZE, sizeof(int), int_compare); - - ASSERT_LE(array[0], 0); - ASSERT_LE(array[1], 14999024); -} - -TEST(LlvmLibcQsortTest, SameElementTwoElementArray) { - int array[2] = {12345, 12345}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - LIBC_NAMESPACE::qsort(array, ARRAY_SIZE, sizeof(int), int_compare); - - ASSERT_LE(array[0], 12345); - ASSERT_LE(array[1], 12345); -} - -TEST(LlvmLibcQSortTest, SingleElementArray) { - constexpr int ELEM = 12345; - int array[1] = {ELEM}; - constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int); - - LIBC_NAMESPACE::qsort(array, ARRAY_SIZE, sizeof(int), int_compare); - - ASSERT_LE(array[0], ELEM); -} +LIST_SORTING_TESTS(Qsort, sort); diff --git a/libc/test/src/stdlib/quick_sort_test.cpp b/libc/test/src/stdlib/quick_sort_test.cpp new file mode 100644 index 00000000000000..d6bf77ebfd40d7 --- /dev/null +++ b/libc/test/src/stdlib/quick_sort_test.cpp @@ -0,0 +1,16 @@ +//===-- Unittests for quick sort ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SortingTest.h" +#include "src/stdlib/quick_sort.h" + +void sort(const LIBC_NAMESPACE::internal::Array &array) { + LIBC_NAMESPACE::internal::quick_sort(array); +} + +LIST_SORTING_TESTS(QuickSort, sort); diff --git a/libc/test/src/string/memory_utils/op_tests.cpp b/libc/test/src/string/memory_utils/op_tests.cpp index 2c7524943c0e6d..978561f31a2961 100644 --- a/libc/test/src/string/memory_utils/op_tests.cpp +++ b/libc/test/src/string/memory_utils/op_tests.cpp @@ -192,6 +192,13 @@ TYPED_TEST(LlvmLibcOpTest, Memset, MemsetImplementations) { } } +#ifdef LIBC_TARGET_ARCH_IS_X86_64 +// Prevent GCC warning due to ignored __aligned__ attributes when passing x86 +// SIMD types as template arguments. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wignored-attributes" +#endif // LIBC_TARGET_ARCH_IS_X86_64 + using BcmpImplementations = testing::TypeList< #ifdef LIBC_TARGET_ARCH_IS_X86_64 #ifdef __SSE4_1__ @@ -224,6 +231,10 @@ using BcmpImplementations = testing::TypeList< generic::BcmpSequence, // generic::Bcmp>; +#ifdef LIBC_TARGET_ARCH_IS_X86_64 +#pragma GCC diagnostic pop +#endif // LIBC_TARGET_ARCH_IS_X86_64 + // Adapt CheckBcmp signature to op implementation signatures. template int CmpAdaptor(cpp::span p1, cpp::span p2, size_t size) { @@ -275,6 +286,13 @@ TYPED_TEST(LlvmLibcOpTest, Bcmp, BcmpImplementations) { } } +#ifdef LIBC_TARGET_ARCH_IS_X86_64 +// Prevent GCC warning due to ignored __aligned__ attributes when passing x86 +// SIMD types as template arguments. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wignored-attributes" +#endif // LIBC_TARGET_ARCH_IS_X86_64 + using MemcmpImplementations = testing::TypeList< #ifdef LIBC_TARGET_ARCH_IS_X86_64 #ifdef __SSE2__ @@ -304,6 +322,10 @@ using MemcmpImplementations = testing::TypeList< generic::MemcmpSequence, generic::Memcmp>; +#ifdef LIBC_TARGET_ARCH_IS_X86_64 +#pragma GCC diagnostic pop +#endif // LIBC_TARGET_ARCH_IS_X86_64 + TYPED_TEST(LlvmLibcOpTest, Memcmp, MemcmpImplementations) { using Impl = ParamType; constexpr size_t kSize = Impl::SIZE; diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt index cabf42ba9b0b2a..3cc404b9c86fc5 100644 --- a/libc/test/src/wchar/CMakeLists.txt +++ b/libc/test/src/wchar/CMakeLists.txt @@ -1,6 +1,6 @@ add_custom_target(libc_wchar_unittests) -add_libc_unittest( +add_libc_test( btowc_test SUITE libc_wchar_unittests @@ -11,7 +11,7 @@ add_libc_unittest( libc.src.wchar.btowc ) -add_libc_unittest( +add_libc_test( wctob_test SUITE libc_wchar_unittests diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index 7cd20acc6ff4a0..1e347d043ef692 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -446,6 +446,8 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_linalg`` *unimplemented* ---------------------------------------------------------- ----------------- + ``__cpp_lib_mdspan`` ``202406L`` + ---------------------------------------------------------- ----------------- ``__cpp_lib_optional_range_support`` *unimplemented* ---------------------------------------------------------- ----------------- ``__cpp_lib_out_ptr`` *unimplemented* diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst index a4cf411d2a6c13..e6d8acb74aeb20 100644 --- a/libcxx/docs/ReleaseNotes/19.rst +++ b/libcxx/docs/ReleaseNotes/19.rst @@ -54,6 +54,7 @@ Implemented Papers - P2713R1 - Escaping improvements in ``std::format`` - P2231R1 - Missing ``constexpr`` in ``std::optional`` and ``std::variant`` - P0019R8 - ``std::atomic_ref`` +- P2389R2 - Alias template ``dims`` for the ``extents`` of ``mdspan`` Improvements and New Features ----------------------------- @@ -134,6 +135,12 @@ Deprecations and Removals `std-allocator-const ` enabled. +- When configuring libc++ with localization or threads disabled, the library no longer emits an error when + trying to ``#include `` and other such headers. Instead, those headers have no content. This is + consistent with the behavior for all other libc++ carve-outs like filesystem, wide characters, a source + of randomness, and others. Users that were checking whether including a header would fail (e.g. via a script + or CMake's ``try_compile`` will experience a change in behavior). + Upcoming Deprecations and Removals ---------------------------------- diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv index 540c6a8dd4f477..f9a70aee1bf46a 100644 --- a/libcxx/docs/Status/Cxx2cIssues.csv +++ b/libcxx/docs/Status/Cxx2cIssues.csv @@ -53,7 +53,7 @@ "`4025 `__","Move assignment operator of ``std::expected`` should not be conditionally deleted","Tokyo March 2024","","","" "`4030 `__","Clarify whether arithmetic expressions in ``[numeric.sat.func]`` are mathematical or C++","Tokyo March 2024","|Nothing To Do|","","" "`4031 `__","``bad_expected_access`` member functions should be ``noexcept``","Tokyo March 2024","|Complete|","16.0","" -"`4035 `__","``single_view`` should provide ``empty``","Tokyo March 2024","","","|ranges|" +"`4035 `__","``single_view`` should provide ``empty``","Tokyo March 2024","|Complete|","19.0","|ranges|" "`4036 `__","``__alignof_is_defined`` is only implicitly specified in C++ and not yet deprecated","Tokyo March 2024","","","" "`4037 `__","Static data members of ``ctype_base`` are not yet required to be usable in constant expressions","Tokyo March 2024","","","" "`4038 `__","``std::text_encoding::aliases_view`` should have constexpr iterators","Tokyo March 2024","","","" diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index aea413dd335886..2c498f336b125f 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -65,7 +65,7 @@ "","","","","","","" "`P2747R2 `__","CWG","``constexpr`` placement new","St. Louis June 2024","","","" "`P2997R1 `__","LWG","Removing the common reference requirement from the indirectly invocable concepts","St. Louis June 2024","","","" -"`P2389R2 `__","LWG","``dextents`` Index Type Parameter","St. Louis June 2024","","","" +"`P2389R2 `__","LWG","``dextents`` Index Type Parameter","St. Louis June 2024","|Complete|","19.0","" "`P3168R2 `__","LWG","Give ``std::optional`` Range Support","St. Louis June 2024","","","|ranges|" "`P3217R0 `__","LWG","Adjoints to 'Enabling list-initialization for algorithms': find_last","St. Louis June 2024","","","" "`P2985R0 `__","LWG","A type trait for detecting virtual base classes","St. Louis June 2024","","","" diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 07dd25604a9c76..cd64fe91449c28 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -777,8 +777,6 @@ set(files __type_traits/is_implicitly_default_constructible.h __type_traits/is_integral.h __type_traits/is_literal_type.h - __type_traits/is_member_function_pointer.h - __type_traits/is_member_object_pointer.h __type_traits/is_member_pointer.h __type_traits/is_nothrow_assignable.h __type_traits/is_nothrow_constructible.h diff --git a/libcxx/include/__chrono/year_month_day.h b/libcxx/include/__chrono/year_month_day.h index 75884f3654d870..b06c0be03e0de4 100644 --- a/libcxx/include/__chrono/year_month_day.h +++ b/libcxx/include/__chrono/year_month_day.h @@ -239,33 +239,11 @@ operator==(const year_month_day_last& __lhs, const year_month_day_last& __rhs) n return __lhs.year() == __rhs.year() && __lhs.month_day_last() == __rhs.month_day_last(); } -_LIBCPP_HIDE_FROM_ABI inline constexpr bool -operator!=(const year_month_day_last& __lhs, const year_month_day_last& __rhs) noexcept { - return !(__lhs == __rhs); -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr bool -operator<(const year_month_day_last& __lhs, const year_month_day_last& __rhs) noexcept { - if (__lhs.year() < __rhs.year()) - return true; - if (__lhs.year() > __rhs.year()) - return false; - return __lhs.month_day_last() < __rhs.month_day_last(); -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr bool -operator>(const year_month_day_last& __lhs, const year_month_day_last& __rhs) noexcept { - return __rhs < __lhs; -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr bool -operator<=(const year_month_day_last& __lhs, const year_month_day_last& __rhs) noexcept { - return !(__rhs < __lhs); -} - -_LIBCPP_HIDE_FROM_ABI inline constexpr bool -operator>=(const year_month_day_last& __lhs, const year_month_day_last& __rhs) noexcept { - return !(__lhs < __rhs); +_LIBCPP_HIDE_FROM_ABI inline constexpr strong_ordering +operator<=>(const year_month_day_last& __lhs, const year_month_day_last& __rhs) noexcept { + if (auto __c = __lhs.year() <=> __rhs.year(); __c != 0) + return __c; + return __lhs.month_day_last() <=> __rhs.month_day_last(); } _LIBCPP_HIDE_FROM_ABI inline constexpr year_month_day_last operator/(const year_month& __lhs, last_spec) noexcept { diff --git a/libcxx/include/__mdspan/extents.h b/libcxx/include/__mdspan/extents.h index fea0decd8c6af7..95082ef3d11ac9 100644 --- a/libcxx/include/__mdspan/extents.h +++ b/libcxx/include/__mdspan/extents.h @@ -454,6 +454,12 @@ struct __make_dextents< _IndexType, 0, extents<_IndexType, _ExtentsPack...>> { template using dextents = typename __mdspan_detail::__make_dextents<_IndexType, _Rank>::type; +# if _LIBCPP_STD_VER >= 26 +// [mdspan.extents.dims], alias template `dims` +template +using dims = dextents<_IndexType, _Rank>; +# endif + // Deduction guide for extents # if _LIBCPP_STD_VER >= 26 template diff --git a/libcxx/include/__ranges/single_view.h b/libcxx/include/__ranges/single_view.h index f91c7c35263676..45244f34994d74 100644 --- a/libcxx/include/__ranges/single_view.h +++ b/libcxx/include/__ranges/single_view.h @@ -70,6 +70,8 @@ class _LIBCPP_ABI_LLVM18_NO_UNIQUE_ADDRESS single_view : public view_interface(); } diff --git a/libcxx/include/__type_traits/invoke.h b/libcxx/include/__type_traits/invoke.h index a0281f5b200640..71db32ae6a3cef 100644 --- a/libcxx/include/__type_traits/invoke.h +++ b/libcxx/include/__type_traits/invoke.h @@ -17,8 +17,7 @@ #include <__type_traits/integral_constant.h> #include <__type_traits/is_base_of.h> #include <__type_traits/is_core_convertible.h> -#include <__type_traits/is_member_function_pointer.h> -#include <__type_traits/is_member_object_pointer.h> +#include <__type_traits/is_member_pointer.h> #include <__type_traits/is_reference_wrapper.h> #include <__type_traits/is_same.h> #include <__type_traits/is_void.h> diff --git a/libcxx/include/__type_traits/is_fundamental.h b/libcxx/include/__type_traits/is_fundamental.h index 57206e0d9deb1f..55f8e41f75f457 100644 --- a/libcxx/include/__type_traits/is_fundamental.h +++ b/libcxx/include/__type_traits/is_fundamental.h @@ -34,7 +34,7 @@ inline constexpr bool is_fundamental_v = __is_fundamental(_Tp); template struct _LIBCPP_TEMPLATE_VIS is_fundamental - : public integral_constant::value || __is_nullptr_t<_Tp>::value || is_arithmetic<_Tp>::value> {}; + : public integral_constant::value || __is_null_pointer_v<_Tp> || is_arithmetic<_Tp>::value> {}; # if _LIBCPP_STD_VER >= 17 template diff --git a/libcxx/include/__type_traits/is_member_function_pointer.h b/libcxx/include/__type_traits/is_member_function_pointer.h deleted file mode 100644 index 037d5ca04ab0b2..00000000000000 --- a/libcxx/include/__type_traits/is_member_function_pointer.h +++ /dev/null @@ -1,31 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___TYPE_TRAITS_IS_MEMBER_FUNCTION_POINTER_H -#define _LIBCPP___TYPE_TRAITS_IS_MEMBER_FUNCTION_POINTER_H - -#include <__config> -#include <__type_traits/integral_constant.h> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -_LIBCPP_BEGIN_NAMESPACE_STD - -template -struct _LIBCPP_TEMPLATE_VIS is_member_function_pointer : _BoolConstant<__is_member_function_pointer(_Tp)> {}; - -# if _LIBCPP_STD_VER >= 17 -template -inline constexpr bool is_member_function_pointer_v = __is_member_function_pointer(_Tp); -# endif - -_LIBCPP_END_NAMESPACE_STD - -#endif // _LIBCPP___TYPE_TRAITS_IS_MEMBER_FUNCTION_POINTER_H diff --git a/libcxx/include/__type_traits/is_member_object_pointer.h b/libcxx/include/__type_traits/is_member_object_pointer.h deleted file mode 100644 index 555794bfe03876..00000000000000 --- a/libcxx/include/__type_traits/is_member_object_pointer.h +++ /dev/null @@ -1,31 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___TYPE_TRAITS_IS_MEMBER_OBJECT_POINTER_H -#define _LIBCPP___TYPE_TRAITS_IS_MEMBER_OBJECT_POINTER_H - -#include <__config> -#include <__type_traits/integral_constant.h> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -_LIBCPP_BEGIN_NAMESPACE_STD - -template -struct _LIBCPP_TEMPLATE_VIS is_member_object_pointer : _BoolConstant<__is_member_object_pointer(_Tp)> {}; - -# if _LIBCPP_STD_VER >= 17 -template -inline constexpr bool is_member_object_pointer_v = __is_member_object_pointer(_Tp); -# endif - -_LIBCPP_END_NAMESPACE_STD - -#endif // _LIBCPP___TYPE_TRAITS_IS_MEMBER_FUNCTION_POINTER_H diff --git a/libcxx/include/__type_traits/is_member_pointer.h b/libcxx/include/__type_traits/is_member_pointer.h index 149634fde75846..cc125e318cf919 100644 --- a/libcxx/include/__type_traits/is_member_pointer.h +++ b/libcxx/include/__type_traits/is_member_pointer.h @@ -21,9 +21,21 @@ _LIBCPP_BEGIN_NAMESPACE_STD template struct _LIBCPP_TEMPLATE_VIS is_member_pointer : _BoolConstant<__is_member_pointer(_Tp)> {}; +template +struct _LIBCPP_TEMPLATE_VIS is_member_object_pointer : _BoolConstant<__is_member_object_pointer(_Tp)> {}; + +template +struct _LIBCPP_TEMPLATE_VIS is_member_function_pointer : _BoolConstant<__is_member_function_pointer(_Tp)> {}; + # if _LIBCPP_STD_VER >= 17 template inline constexpr bool is_member_pointer_v = __is_member_pointer(_Tp); + +template +inline constexpr bool is_member_object_pointer_v = __is_member_object_pointer(_Tp); + +template +inline constexpr bool is_member_function_pointer_v = __is_member_function_pointer(_Tp); # endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_null_pointer.h b/libcxx/include/__type_traits/is_null_pointer.h index c666f5f24759c9..9f5697e232684e 100644 --- a/libcxx/include/__type_traits/is_null_pointer.h +++ b/libcxx/include/__type_traits/is_null_pointer.h @@ -11,7 +11,6 @@ #include <__config> #include <__type_traits/integral_constant.h> -#include <__type_traits/remove_cv.h> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -21,20 +20,15 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct __is_nullptr_t_impl : public false_type {}; -template <> -struct __is_nullptr_t_impl : public true_type {}; - -template -struct _LIBCPP_TEMPLATE_VIS __is_nullptr_t : public __is_nullptr_t_impl<__remove_cv_t<_Tp> > {}; +inline const bool __is_null_pointer_v = __is_same(__remove_cv(_Tp), nullptr_t); #if _LIBCPP_STD_VER >= 14 template -struct _LIBCPP_TEMPLATE_VIS is_null_pointer : public __is_nullptr_t_impl<__remove_cv_t<_Tp> > {}; +struct _LIBCPP_TEMPLATE_VIS is_null_pointer : integral_constant> {}; # if _LIBCPP_STD_VER >= 17 template -inline constexpr bool is_null_pointer_v = is_null_pointer<_Tp>::value; +inline constexpr bool is_null_pointer_v = __is_null_pointer_v<_Tp>; # endif #endif // _LIBCPP_STD_VER >= 14 diff --git a/libcxx/include/__type_traits/is_scalar.h b/libcxx/include/__type_traits/is_scalar.h index 15f1c71554f220..455200de472089 100644 --- a/libcxx/include/__type_traits/is_scalar.h +++ b/libcxx/include/__type_traits/is_scalar.h @@ -49,7 +49,7 @@ struct _LIBCPP_TEMPLATE_VIS is_scalar bool, is_arithmetic<_Tp>::value || is_member_pointer<_Tp>::value || is_pointer<_Tp>::value || - __is_nullptr_t<_Tp>::value || + __is_null_pointer_v<_Tp> || __is_block<_Tp>::value || is_enum<_Tp>::value> {}; // clang-format on diff --git a/libcxx/include/barrier b/libcxx/include/barrier index 12608e17d8f6db..edee181273e248 100644 --- a/libcxx/include/barrier +++ b/libcxx/include/barrier @@ -47,30 +47,28 @@ namespace std #include <__config> -#ifdef _LIBCPP_HAS_NO_THREADS -# error " is not supported since libc++ has been configured without support for threads." -#endif - -#include <__assert> -#include <__atomic/atomic_base.h> -#include <__atomic/memory_order.h> -#include <__memory/unique_ptr.h> -#include <__thread/poll_with_backoff.h> -#include <__thread/timed_backoff_policy.h> -#include <__utility/move.h> -#include -#include -#include -#include - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif +#if !defined(_LIBCPP_HAS_NO_THREADS) + +# include <__assert> +# include <__atomic/atomic_base.h> +# include <__atomic/memory_order.h> +# include <__memory/unique_ptr.h> +# include <__thread/poll_with_backoff.h> +# include <__thread/timed_backoff_policy.h> +# include <__utility/move.h> +# include +# include +# include +# include + +# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +# endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +# include <__undef_macros> -#if _LIBCPP_STD_VER >= 14 +# if _LIBCPP_STD_VER >= 14 _LIBCPP_BEGIN_NAMESPACE_STD @@ -78,7 +76,7 @@ struct __empty_completion { inline _LIBCPP_HIDE_FROM_ABI void operator()() noexcept {} }; -# ifndef _LIBCPP_HAS_NO_TREE_BARRIER +# ifndef _LIBCPP_HAS_NO_TREE_BARRIER /* @@ -152,7 +150,7 @@ public: } }; -# else +# else /* @@ -253,7 +251,7 @@ public: } }; -# endif // !_LIBCPP_HAS_NO_TREE_BARRIER +# endif // !_LIBCPP_HAS_NO_TREE_BARRIER template class _LIBCPP_DEPRECATED_ATOMIC_SYNC barrier { @@ -265,7 +263,7 @@ public: static _LIBCPP_HIDE_FROM_ABI constexpr ptrdiff_t max() noexcept { return __barrier_base<_CompletionF>::max(); } _LIBCPP_AVAILABILITY_SYNC - _LIBCPP_HIDE_FROM_ABI explicit barrier(ptrdiff_t __count, _CompletionF __completion = _CompletionF()) + _LIBCPP_HIDE_FROM_ABI explicit barrier(ptrdiff_t __count, _CompletionF __completion = _CompletionF()) : __b_(__count, std::move(__completion)) { _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN( __count >= 0, @@ -292,10 +290,12 @@ public: _LIBCPP_END_NAMESPACE_STD -#endif // _LIBCPP_STD_VER >= 14 +# endif // _LIBCPP_STD_VER >= 14 _LIBCPP_POP_MACROS +#endif // !defined(_LIBCPP_HAS_NO_THREADS) + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/future b/libcxx/include/future index dea73dc6389bc7..0be32620139e37 100644 --- a/libcxx/include/future +++ b/libcxx/include/future @@ -364,44 +364,42 @@ template struct uses_allocator, Alloc>; #include <__config> -#ifdef _LIBCPP_HAS_NO_THREADS -# error " is not supported since libc++ has been configured without support for threads." -#endif - -#include <__assert> -#include <__chrono/duration.h> -#include <__chrono/time_point.h> -#include <__exception/exception_ptr.h> -#include <__memory/addressof.h> -#include <__memory/allocator.h> -#include <__memory/allocator_arg_t.h> -#include <__memory/allocator_destructor.h> -#include <__memory/allocator_traits.h> -#include <__memory/compressed_pair.h> -#include <__memory/pointer_traits.h> -#include <__memory/shared_ptr.h> -#include <__memory/unique_ptr.h> -#include <__memory/uses_allocator.h> -#include <__system_error/error_category.h> -#include <__system_error/error_code.h> -#include <__system_error/error_condition.h> -#include <__type_traits/aligned_storage.h> -#include <__type_traits/strip_signature.h> -#include <__utility/auto_cast.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include -#include -#include -#include -#include - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif +#if !defined(_LIBCPP_HAS_NO_THREADS) + +# include <__assert> +# include <__chrono/duration.h> +# include <__chrono/time_point.h> +# include <__exception/exception_ptr.h> +# include <__memory/addressof.h> +# include <__memory/allocator.h> +# include <__memory/allocator_arg_t.h> +# include <__memory/allocator_destructor.h> +# include <__memory/allocator_traits.h> +# include <__memory/compressed_pair.h> +# include <__memory/pointer_traits.h> +# include <__memory/shared_ptr.h> +# include <__memory/unique_ptr.h> +# include <__memory/uses_allocator.h> +# include <__system_error/error_category.h> +# include <__system_error/error_code.h> +# include <__system_error/error_condition.h> +# include <__type_traits/aligned_storage.h> +# include <__type_traits/strip_signature.h> +# include <__utility/auto_cast.h> +# include <__utility/forward.h> +# include <__utility/move.h> +# include +# include +# include +# include +# include + +# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +# endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +# include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -413,16 +411,16 @@ _LIBCPP_DECLARE_STRONG_ENUM_EPILOG(future_errc) template <> struct _LIBCPP_TEMPLATE_VIS is_error_code_enum : public true_type {}; -#ifdef _LIBCPP_CXX03_LANG +# ifdef _LIBCPP_CXX03_LANG template <> struct _LIBCPP_TEMPLATE_VIS is_error_code_enum : public true_type {}; -#endif +# endif // enum class launch _LIBCPP_DECLARE_STRONG_ENUM(launch){async = 1, deferred = 2, any = async | deferred}; _LIBCPP_DECLARE_STRONG_ENUM_EPILOG(launch) -#ifndef _LIBCPP_CXX03_LANG +# ifndef _LIBCPP_CXX03_LANG typedef underlying_type::type __launch_underlying_type; @@ -457,7 +455,7 @@ inline _LIBCPP_HIDE_FROM_ABI launch& operator^=(launch& __x, launch __y) { return __x; } -#endif // !_LIBCPP_CXX03_LANG +# endif // !_LIBCPP_CXX03_LANG // enum class future_status _LIBCPP_DECLARE_STRONG_ENUM(future_status){ready, timeout, deferred}; @@ -484,9 +482,9 @@ class _LIBCPP_EXPORTED_FROM_ABI future_error : public logic_error { friend class promise; public: -#if _LIBCPP_STD_VER >= 17 +# if _LIBCPP_STD_VER >= 17 _LIBCPP_HIDE_FROM_ABI explicit future_error(future_errc __ec) : future_error(std::make_error_code(__ec)) {} -#endif +# endif _LIBCPP_HIDE_FROM_ABI const error_code& code() const _NOEXCEPT { return __ec_; } @@ -496,12 +494,12 @@ public: // Declared above std::future_error void __throw_future_error(future_errc __ev) { -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw future_error(make_error_code(__ev)); -#else +# else (void)__ev; _LIBCPP_VERBOSE_ABORT("future_error was thrown in -fno-exceptions mode"); -#endif +# endif } class _LIBCPP_EXPORTED_FROM_ABI __assoc_sub_state : public __shared_count { @@ -775,15 +773,15 @@ inline __deferred_assoc_state<_Rp, _Fp>::__deferred_assoc_state(_Fp&& __f) : __f template void __deferred_assoc_state<_Rp, _Fp>::__execute() { -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { -#endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_HAS_NO_EXCEPTIONS this->set_value(__func_()); -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS } catch (...) { this->set_exception(current_exception()); } -#endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_HAS_NO_EXCEPTIONS } template @@ -805,16 +803,16 @@ inline __deferred_assoc_state::__deferred_assoc_state(_Fp&& __f) : __ template void __deferred_assoc_state::__execute() { -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { -#endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_HAS_NO_EXCEPTIONS __func_(); this->set_value(); -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS } catch (...) { this->set_exception(current_exception()); } -#endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_HAS_NO_EXCEPTIONS } template @@ -836,15 +834,15 @@ inline __async_assoc_state<_Rp, _Fp>::__async_assoc_state(_Fp&& __f) : __func_(s template void __async_assoc_state<_Rp, _Fp>::__execute() { -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { -#endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_HAS_NO_EXCEPTIONS this->set_value(__func_()); -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS } catch (...) { this->set_exception(current_exception()); } -#endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_HAS_NO_EXCEPTIONS } template @@ -872,16 +870,16 @@ inline __async_assoc_state::__async_assoc_state(_Fp&& __f) : __func_( template void __async_assoc_state::__execute() { -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { -#endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_HAS_NO_EXCEPTIONS __func_(); this->set_value(); -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS } catch (...) { this->set_exception(current_exception()); } -#endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_HAS_NO_EXCEPTIONS } template @@ -1386,7 +1384,7 @@ template class __packaged_task_base<_Rp(_ArgTypes...)> { public: _LIBCPP_HIDE_FROM_ABI __packaged_task_base() {} - __packaged_task_base(const __packaged_task_base&) = delete; + __packaged_task_base(const __packaged_task_base&) = delete; __packaged_task_base& operator=(const __packaged_task_base&) = delete; _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual ~__packaged_task_base() {} @@ -1648,15 +1646,15 @@ void packaged_task<_Rp(_ArgTypes...)>::operator()(_ArgTypes... __args) { __throw_future_error(future_errc::no_state); if (__p_.__state_->__has_value()) __throw_future_error(future_errc::promise_already_satisfied); -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { -#endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_HAS_NO_EXCEPTIONS __p_.set_value(__f_(std::forward<_ArgTypes>(__args)...)); -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS } catch (...) { __p_.set_exception(current_exception()); } -#endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_HAS_NO_EXCEPTIONS } template @@ -1665,15 +1663,15 @@ void packaged_task<_Rp(_ArgTypes...)>::make_ready_at_thread_exit(_ArgTypes... __ __throw_future_error(future_errc::no_state); if (__p_.__state_->__has_value()) __throw_future_error(future_errc::promise_already_satisfied); -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { -#endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_HAS_NO_EXCEPTIONS __p_.set_value_at_thread_exit(__f_(std::forward<_ArgTypes>(__args)...)); -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS } catch (...) { __p_.set_exception_at_thread_exit(current_exception()); } -#endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_HAS_NO_EXCEPTIONS } template @@ -1732,7 +1730,7 @@ public: _LIBCPP_HIDE_FROM_ABI void reset(); }; -#if _LIBCPP_STD_VER >= 17 +# if _LIBCPP_STD_VER >= 17 template packaged_task(_Rp (*)(_Args...)) -> packaged_task<_Rp(_Args...)>; @@ -1740,7 +1738,7 @@ packaged_task(_Rp (*)(_Args...)) -> packaged_task<_Rp(_Args...)>; template ::type> packaged_task(_Fp) -> packaged_task<_Stripped>; -#endif +# endif template void packaged_task::operator()(_ArgTypes... __args) { @@ -1748,16 +1746,16 @@ void packaged_task::operator()(_ArgTypes... __args) { __throw_future_error(future_errc::no_state); if (__p_.__state_->__has_value()) __throw_future_error(future_errc::promise_already_satisfied); -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { -#endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_HAS_NO_EXCEPTIONS __f_(std::forward<_ArgTypes>(__args)...); __p_.set_value(); -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS } catch (...) { __p_.set_exception(current_exception()); } -#endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_HAS_NO_EXCEPTIONS } template @@ -1766,16 +1764,16 @@ void packaged_task::make_ready_at_thread_exit(_ArgTypes... _ __throw_future_error(future_errc::no_state); if (__p_.__state_->__has_value()) __throw_future_error(future_errc::promise_already_satisfied); -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { -#endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_HAS_NO_EXCEPTIONS __f_(std::forward<_ArgTypes>(__args)...); __p_.set_value_at_thread_exit(); -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS } catch (...) { __p_.set_exception_at_thread_exit(current_exception()); } -#endif // _LIBCPP_HAS_NO_EXCEPTIONS +# endif // _LIBCPP_HAS_NO_EXCEPTIONS } template @@ -1809,7 +1807,7 @@ _LIBCPP_HIDE_FROM_ABI future<_Rp> __make_async_assoc_state(_Fp&& __f) { return future<_Rp>(__h.get()); } -#ifndef _LIBCPP_CXX03_LANG +# ifndef _LIBCPP_CXX03_LANG template class _LIBCPP_HIDDEN __async_func { @@ -1845,18 +1843,18 @@ async(launch __policy, _Fp&& __f, _Args&&... __args) { typedef __async_func<__decay_t<_Fp>, __decay_t<_Args>...> _BF; typedef typename _BF::_Rp _Rp; -# ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { -# endif +# endif if (__does_policy_contain(__policy, launch::async)) return std::__make_async_assoc_state<_Rp>( _BF(_LIBCPP_AUTO_CAST(std::forward<_Fp>(__f)), _LIBCPP_AUTO_CAST(std::forward<_Args>(__args))...)); -# ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS } catch (...) { if (__policy == launch::async) throw; } -# endif +# endif if (__does_policy_contain(__policy, launch::deferred)) return std::__make_deferred_assoc_state<_Rp>( @@ -1870,7 +1868,7 @@ async(_Fp&& __f, _Args&&... __args) { return std::async(launch::any, std::forward<_Fp>(__f), std::forward<_Args>(__args)...); } -#endif // C++03 +# endif // C++03 // shared_future @@ -2047,6 +2045,8 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS +#endif // !defined(_LIBCPP_HAS_NO_THREADS) + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17 # include #endif diff --git a/libcxx/include/ios b/libcxx/include/ios index a653af005a18d1..0a813c07721fee 100644 --- a/libcxx/include/ios +++ b/libcxx/include/ios @@ -213,36 +213,34 @@ storage-class-specifier const error_category& iostream_category() noexcept; #include <__config> -#if defined(_LIBCPP_HAS_NO_LOCALIZATION) -# error "The iostreams library is not supported since libc++ has been configured without support for localization." -#endif - -#include <__fwd/ios.h> -#include <__ios/fpos.h> -#include <__locale> -#include <__system_error/error_category.h> -#include <__system_error/error_code.h> -#include <__system_error/error_condition.h> -#include <__system_error/system_error.h> -#include <__utility/swap.h> -#include <__verbose_abort> -#include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) + +# include <__fwd/ios.h> +# include <__ios/fpos.h> +# include <__locale> +# include <__system_error/error_category.h> +# include <__system_error/error_code.h> +# include <__system_error/error_condition.h> +# include <__system_error/system_error.h> +# include <__utility/swap.h> +# include <__verbose_abort> +# include // standard-mandated includes // [ios.syn] -#include +# include -#if !defined(_LIBCPP_HAS_NO_ATOMIC_HEADER) -# include <__atomic/atomic.h> // for __xindex_ -#endif +# if !defined(_LIBCPP_HAS_NO_ATOMIC_HEADER) +# include <__atomic/atomic.h> // for __xindex_ +# endif -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif +# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +# endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +# include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -285,20 +283,20 @@ public: static const openmode in = 0x08; static const openmode out = 0x10; static const openmode trunc = 0x20; -#if _LIBCPP_STD_VER >= 23 +# if _LIBCPP_STD_VER >= 23 static const openmode noreplace = 0x40; -#endif +# endif enum seekdir { beg, cur, end }; -#if _LIBCPP_STD_VER <= 14 +# if _LIBCPP_STD_VER <= 14 typedef iostate io_state; typedef openmode open_mode; typedef seekdir seek_dir; typedef std::streamoff streamoff; typedef std::streampos streampos; -#endif +# endif class _LIBCPP_EXPORTED_FROM_ABI Init; @@ -398,11 +396,11 @@ private: size_t __event_cap_; // TODO(EricWF): Enable this for both Clang and GCC. Currently it is only // enabled with clang. -#if defined(_LIBCPP_HAS_C_ATOMIC_IMP) && !defined(_LIBCPP_HAS_NO_THREADS) +# if defined(_LIBCPP_HAS_C_ATOMIC_IMP) && !defined(_LIBCPP_HAS_NO_THREADS) static atomic __xindex_; -#else +# else static int __xindex_; -#endif +# endif long* __iarray_; size_t __iarray_size_; size_t __iarray_cap_; @@ -418,10 +416,10 @@ _LIBCPP_DECLARE_STRONG_ENUM_EPILOG(io_errc) template <> struct _LIBCPP_TEMPLATE_VIS is_error_code_enum : public true_type {}; -#ifdef _LIBCPP_CXX03_LANG +# ifdef _LIBCPP_CXX03_LANG template <> struct _LIBCPP_TEMPLATE_VIS is_error_code_enum : public true_type {}; -#endif +# endif _LIBCPP_EXPORTED_FROM_ABI const error_category& iostream_category() _NOEXCEPT; @@ -442,11 +440,11 @@ public: }; _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_failure(char const* __msg) { -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw ios_base::failure(__msg); -#else +# else _LIBCPP_VERBOSE_ABORT("ios_base::failure was thrown in -fno-exceptions mode with message \"%s\"", __msg); -#endif +# endif } class _LIBCPP_EXPORTED_FROM_ABI ios_base::Init { @@ -535,13 +533,13 @@ public: static_assert(is_same<_CharT, typename traits_type::char_type>::value, "traits_type::char_type must be the same type as CharT"); -#ifdef _LIBCPP_CXX03_LANG +# ifdef _LIBCPP_CXX03_LANG // Preserve the ability to compare with literal 0, // and implicitly convert to bool, but not implicitly convert to int. _LIBCPP_HIDE_FROM_ABI operator void*() const { return fail() ? nullptr : (void*)this; } -#else +# else _LIBCPP_HIDE_FROM_ABI explicit operator bool() const { return !fail(); } -#endif +# endif _LIBCPP_HIDE_FROM_ABI bool operator!() const { return fail(); } _LIBCPP_HIDE_FROM_ABI iostate rdstate() const { return ios_base::rdstate(); } @@ -704,9 +702,9 @@ inline _LIBCPP_HIDE_FROM_ABI void basic_ios<_CharT, _Traits>::set_rdbuf(basic_st extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_ios; -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_ios; -#endif +# endif _LIBCPP_HIDE_FROM_ABI inline ios_base& boolalpha(ios_base& __str) { __str.setf(ios_base::boolalpha); @@ -832,6 +830,8 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS +#endif // !defined(_LIBCPP_HAS_NO_LOCALIZATION) + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/latch b/libcxx/include/latch index da8dae149c79f3..81d6028a9c2ce1 100644 --- a/libcxx/include/latch +++ b/libcxx/include/latch @@ -42,26 +42,24 @@ namespace std #include <__config> -#ifdef _LIBCPP_HAS_NO_THREADS -# error " is not supported since libc++ has been configured without support for threads." -#endif +#if !defined(_LIBCPP_HAS_NO_THREADS) -#include <__assert> -#include <__atomic/atomic_base.h> -#include <__atomic/atomic_sync.h> -#include <__atomic/memory_order.h> -#include -#include -#include +# include <__assert> +# include <__atomic/atomic_base.h> +# include <__atomic/atomic_sync.h> +# include <__atomic/memory_order.h> +# include +# include +# include -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif +# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +# endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +# include <__undef_macros> -#if _LIBCPP_STD_VER >= 14 +# if _LIBCPP_STD_VER >= 14 _LIBCPP_BEGIN_NAMESPACE_STD @@ -118,10 +116,12 @@ private: _LIBCPP_END_NAMESPACE_STD -#endif // _LIBCPP_STD_VER >= 14 +# endif // _LIBCPP_STD_VER >= 14 _LIBCPP_POP_MACROS +#endif // !defined(_LIBCPP_HAS_NO_THREADS) + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include #endif diff --git a/libcxx/include/locale b/libcxx/include/locale index 19e81e110b69ca..dbec23a2c936df 100644 --- a/libcxx/include/locale +++ b/libcxx/include/locale @@ -187,67 +187,70 @@ template class messages_byname; */ -#include <__algorithm/copy.h> -#include <__algorithm/equal.h> -#include <__algorithm/find.h> -#include <__algorithm/max.h> -#include <__algorithm/reverse.h> -#include <__algorithm/unwrap_iter.h> -#include <__assert> #include <__config> -#include <__iterator/access.h> -#include <__iterator/back_insert_iterator.h> -#include <__iterator/istreambuf_iterator.h> -#include <__iterator/ostreambuf_iterator.h> -#include <__locale> -#include <__memory/unique_ptr.h> -#include <__type_traits/make_unsigned.h> -#include -#include -#include -#include -#include -#include -#include -#include -#include + +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) + +# include <__algorithm/copy.h> +# include <__algorithm/equal.h> +# include <__algorithm/find.h> +# include <__algorithm/max.h> +# include <__algorithm/reverse.h> +# include <__algorithm/unwrap_iter.h> +# include <__assert> +# include <__iterator/access.h> +# include <__iterator/back_insert_iterator.h> +# include <__iterator/istreambuf_iterator.h> +# include <__iterator/ostreambuf_iterator.h> +# include <__locale> +# include <__memory/unique_ptr.h> +# include <__type_traits/make_unsigned.h> +# include +# include +# include +# include +# include +# include +# include +# include +# include // TODO: Fix __bsd_locale_defaults.h // NOLINTBEGIN(libcpp-robust-against-adl) -#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) +# if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) // Most unix variants have catopen. These are the specific ones that don't. -# if !defined(__BIONIC__) && !defined(_NEWLIB_VERSION) && !defined(__EMSCRIPTEN__) -# define _LIBCPP_HAS_CATOPEN 1 -# include +# if !defined(__BIONIC__) && !defined(_NEWLIB_VERSION) && !defined(__EMSCRIPTEN__) +# define _LIBCPP_HAS_CATOPEN 1 +# include +# endif # endif -#endif -#ifdef _LIBCPP_LOCALE__L_EXTENSIONS -# include <__locale_dir/locale_base_api/bsd_locale_defaults.h> -#else -# include <__locale_dir/locale_base_api/bsd_locale_fallbacks.h> -#endif +# ifdef _LIBCPP_LOCALE__L_EXTENSIONS +# include <__locale_dir/locale_base_api/bsd_locale_defaults.h> +# else +# include <__locale_dir/locale_base_api/bsd_locale_fallbacks.h> +# endif -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif +# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +# endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +# include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD -#if defined(__APPLE__) || defined(__FreeBSD__) -# define _LIBCPP_GET_C_LOCALE 0 -#elif defined(__NetBSD__) -# define _LIBCPP_GET_C_LOCALE LC_C_LOCALE -#else -# define _LIBCPP_GET_C_LOCALE __cloc() +# if defined(__APPLE__) || defined(__FreeBSD__) +# define _LIBCPP_GET_C_LOCALE 0 +# elif defined(__NetBSD__) +# define _LIBCPP_GET_C_LOCALE LC_C_LOCALE +# else +# define _LIBCPP_GET_C_LOCALE __cloc() // Get the C locale object _LIBCPP_EXPORTED_FROM_ABI locale_t __cloc(); -# define __cloc_defined -#endif +# define __cloc_defined +# endif // __scan_keyword // Scans [__b, __e) until a match is found in the basic_strings range @@ -395,7 +398,7 @@ struct __num_get : protected __num_get_base { unsigned*& __g_end, unsigned& __dc, _CharT* __atoms); -#ifndef _LIBCPP_ABI_OPTIMIZED_LOCALE_NUM_GET +# ifndef _LIBCPP_ABI_OPTIMIZED_LOCALE_NUM_GET static string __stage2_int_prep(ios_base& __iob, _CharT* __atoms, _CharT& __thousands_sep); static int __stage2_int_loop( _CharT __ct, @@ -409,7 +412,7 @@ struct __num_get : protected __num_get_base { unsigned*& __g_end, _CharT* __atoms); -#else +# else static string __stage2_int_prep(ios_base& __iob, _CharT& __thousands_sep) { locale __loc = __iob.getloc(); const numpunct<_CharT>& __np = use_facet >(__loc); @@ -444,10 +447,10 @@ private: (void)__atoms; return __src; } -#endif +# endif }; -#ifndef _LIBCPP_ABI_OPTIMIZED_LOCALE_NUM_GET +# ifndef _LIBCPP_ABI_OPTIMIZED_LOCALE_NUM_GET template string __num_get<_CharT>::__stage2_int_prep(ios_base& __iob, _CharT* __atoms, _CharT& __thousands_sep) { locale __loc = __iob.getloc(); @@ -456,7 +459,7 @@ string __num_get<_CharT>::__stage2_int_prep(ios_base& __iob, _CharT* __atoms, _C __thousands_sep = __np.thousands_sep(); return __np.grouping(); } -#endif +# endif template string __num_get<_CharT>::__stage2_float_prep( @@ -471,16 +474,16 @@ string __num_get<_CharT>::__stage2_float_prep( template int -#ifndef _LIBCPP_ABI_OPTIMIZED_LOCALE_NUM_GET +# ifndef _LIBCPP_ABI_OPTIMIZED_LOCALE_NUM_GET __num_get<_CharT>::__stage2_int_loop(_CharT __ct, int __base, char* __a, char*& __a_end, unsigned& __dc, _CharT __thousands_sep, const string& __grouping, unsigned* __g, unsigned*& __g_end, _CharT* __atoms) -#else +# else __num_get<_CharT>::__stage2_int_loop(_CharT __ct, int __base, char* __a, char*& __a_end, unsigned& __dc, _CharT __thousands_sep, const string& __grouping, unsigned* __g, unsigned*& __g_end, const _CharT* __atoms) -#endif +# endif { if (__a_end == __a && (__ct == __atoms[24] || __ct == __atoms[25])) { *__a_end++ = __ct == __atoms[24] ? '+' : '-'; @@ -579,9 +582,9 @@ int __num_get<_CharT>::__stage2_float_loop( } extern template struct _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS __num_get; -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS extern template struct _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS __num_get; -#endif +# endif template > class _LIBCPP_TEMPLATE_VIS num_get : public locale::facet, private __num_get<_CharT> { @@ -851,14 +854,14 @@ _InputIterator num_get<_CharT, _InputIterator>::__do_get_signed( // Stage 2 char_type __thousands_sep; const int __atoms_size = __num_get_base::__int_chr_cnt; -#ifdef _LIBCPP_ABI_OPTIMIZED_LOCALE_NUM_GET +# ifdef _LIBCPP_ABI_OPTIMIZED_LOCALE_NUM_GET char_type __atoms1[__atoms_size]; const char_type* __atoms = this->__do_widen(__iob, __atoms1); string __grouping = this->__stage2_int_prep(__iob, __thousands_sep); -#else +# else char_type __atoms[__atoms_size]; string __grouping = this->__stage2_int_prep(__iob, __atoms, __thousands_sep); -#endif +# endif string __buf; __buf.resize(__buf.capacity()); char* __a = &__buf[0]; @@ -900,14 +903,14 @@ _InputIterator num_get<_CharT, _InputIterator>::__do_get_unsigned( // Stage 2 char_type __thousands_sep; const int __atoms_size = __num_get_base::__int_chr_cnt; -#ifdef _LIBCPP_ABI_OPTIMIZED_LOCALE_NUM_GET +# ifdef _LIBCPP_ABI_OPTIMIZED_LOCALE_NUM_GET char_type __atoms1[__atoms_size]; const char_type* __atoms = this->__do_widen(__iob, __atoms1); string __grouping = this->__stage2_int_prep(__iob, __thousands_sep); -#else +# else char_type __atoms[__atoms_size]; string __grouping = this->__stage2_int_prep(__iob, __atoms, __thousands_sep); -#endif +# endif string __buf; __buf.resize(__buf.capacity()); char* __a = &__buf[0]; @@ -1050,9 +1053,9 @@ _InputIterator num_get<_CharT, _InputIterator>::do_get( } extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS num_get; -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS num_get; -#endif +# endif struct _LIBCPP_EXPORTED_FROM_ABI __num_put_base { protected: @@ -1168,9 +1171,9 @@ void __num_put<_CharT>::__widen_and_group_float( } extern template struct _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS __num_put; -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS extern template struct _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS __num_put; -#endif +# endif template > class _LIBCPP_TEMPLATE_VIS num_put : public locale::facet, private __num_put<_CharT> { @@ -1455,9 +1458,9 @@ num_put<_CharT, _OutputIterator>::do_put(iter_type __s, ios_base& __iob, char_ty } extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS num_put; -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS num_put; -#endif +# endif template _LIBCPP_HIDE_FROM_ABI int __get_up_to_n_digits( @@ -1522,7 +1525,7 @@ _LIBCPP_EXPORTED_FROM_ABI const string& __time_get_c_storage::__x() const; template <> _LIBCPP_EXPORTED_FROM_ABI const string& __time_get_c_storage::__X() const; -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS template <> _LIBCPP_EXPORTED_FROM_ABI const wstring* __time_get_c_storage::__weeks() const; template <> @@ -1537,7 +1540,7 @@ template <> _LIBCPP_EXPORTED_FROM_ABI const wstring& __time_get_c_storage::__x() const; template <> _LIBCPP_EXPORTED_FROM_ABI const wstring& __time_get_c_storage::__X() const; -#endif +# endif template > class _LIBCPP_TEMPLATE_VIS time_get : public locale::facet, public time_base, private __time_get_c_storage<_CharT> { @@ -1991,9 +1994,9 @@ _InputIterator time_get<_CharT, _InputIterator>::do_get( } extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS time_get; -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS time_get; -#endif +# endif class _LIBCPP_EXPORTED_FROM_ABI __time_get { protected: @@ -2029,31 +2032,32 @@ private: string_type __analyze(char __fmt, const ctype<_CharT>&); }; -#define _LIBCPP_TIME_GET_STORAGE_EXPLICIT_INSTANTIATION(_CharT) \ - template <> \ - _LIBCPP_EXPORTED_FROM_ABI time_base::dateorder __time_get_storage<_CharT>::__do_date_order() const; \ - template <> \ - _LIBCPP_EXPORTED_FROM_ABI __time_get_storage<_CharT>::__time_get_storage(const char*); \ - template <> \ - _LIBCPP_EXPORTED_FROM_ABI __time_get_storage<_CharT>::__time_get_storage(const string&); \ - template <> \ - _LIBCPP_EXPORTED_FROM_ABI void __time_get_storage<_CharT>::init(const ctype<_CharT>&); \ - template <> \ - _LIBCPP_EXPORTED_FROM_ABI __time_get_storage<_CharT>::string_type __time_get_storage<_CharT>::__analyze( \ - char, const ctype<_CharT>&); \ - extern template _LIBCPP_EXPORTED_FROM_ABI time_base::dateorder __time_get_storage<_CharT>::__do_date_order() const; \ - extern template _LIBCPP_EXPORTED_FROM_ABI __time_get_storage<_CharT>::__time_get_storage(const char*); \ - extern template _LIBCPP_EXPORTED_FROM_ABI __time_get_storage<_CharT>::__time_get_storage(const string&); \ - extern template _LIBCPP_EXPORTED_FROM_ABI void __time_get_storage<_CharT>::init(const ctype<_CharT>&); \ - extern template _LIBCPP_EXPORTED_FROM_ABI __time_get_storage<_CharT>::string_type \ - __time_get_storage<_CharT>::__analyze(char, const ctype<_CharT>&); \ - /**/ +# define _LIBCPP_TIME_GET_STORAGE_EXPLICIT_INSTANTIATION(_CharT) \ + template <> \ + _LIBCPP_EXPORTED_FROM_ABI time_base::dateorder __time_get_storage<_CharT>::__do_date_order() const; \ + template <> \ + _LIBCPP_EXPORTED_FROM_ABI __time_get_storage<_CharT>::__time_get_storage(const char*); \ + template <> \ + _LIBCPP_EXPORTED_FROM_ABI __time_get_storage<_CharT>::__time_get_storage(const string&); \ + template <> \ + _LIBCPP_EXPORTED_FROM_ABI void __time_get_storage<_CharT>::init(const ctype<_CharT>&); \ + template <> \ + _LIBCPP_EXPORTED_FROM_ABI __time_get_storage<_CharT>::string_type __time_get_storage<_CharT>::__analyze( \ + char, const ctype<_CharT>&); \ + extern template _LIBCPP_EXPORTED_FROM_ABI time_base::dateorder __time_get_storage<_CharT>::__do_date_order() \ + const; \ + extern template _LIBCPP_EXPORTED_FROM_ABI __time_get_storage<_CharT>::__time_get_storage(const char*); \ + extern template _LIBCPP_EXPORTED_FROM_ABI __time_get_storage<_CharT>::__time_get_storage(const string&); \ + extern template _LIBCPP_EXPORTED_FROM_ABI void __time_get_storage<_CharT>::init(const ctype<_CharT>&); \ + extern template _LIBCPP_EXPORTED_FROM_ABI __time_get_storage<_CharT>::string_type \ + __time_get_storage<_CharT>::__analyze(char, const ctype<_CharT>&); \ + /**/ _LIBCPP_TIME_GET_STORAGE_EXPLICIT_INSTANTIATION(char) -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS _LIBCPP_TIME_GET_STORAGE_EXPLICIT_INSTANTIATION(wchar_t) -#endif -#undef _LIBCPP_TIME_GET_STORAGE_EXPLICIT_INSTANTIATION +# endif +# undef _LIBCPP_TIME_GET_STORAGE_EXPLICIT_INSTANTIATION template > class _LIBCPP_TEMPLATE_VIS time_get_byname @@ -2086,9 +2090,9 @@ private: }; extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS time_get_byname; -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS time_get_byname; -#endif +# endif class _LIBCPP_EXPORTED_FROM_ABI __time_put { locale_t __loc_; @@ -2099,9 +2103,9 @@ protected: __time_put(const string& __nm); ~__time_put(); void __do_put(char* __nb, char*& __ne, const tm* __tm, char __fmt, char __mod) const; -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS void __do_put(wchar_t* __wb, wchar_t*& __we, const tm* __tm, char __fmt, char __mod) const; -#endif +# endif }; template > @@ -2175,9 +2179,9 @@ _OutputIterator time_put<_CharT, _OutputIterator>::do_put( } extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS time_put; -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS time_put; -#endif +# endif template > class _LIBCPP_TEMPLATE_VIS time_put_byname : public time_put<_CharT, _OutputIterator> { @@ -2193,9 +2197,9 @@ protected: }; extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS time_put_byname; -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS time_put_byname; -#endif +# endif // money_base @@ -2260,10 +2264,10 @@ const bool moneypunct<_CharT, _International>::intl; extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS moneypunct; extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS moneypunct; -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS moneypunct; extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS moneypunct; -#endif +# endif // moneypunct_byname @@ -2318,14 +2322,14 @@ _LIBCPP_EXPORTED_FROM_ABI void moneypunct_byname::init(const char*); extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS moneypunct_byname; extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS moneypunct_byname; -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS template <> _LIBCPP_EXPORTED_FROM_ABI void moneypunct_byname::init(const char*); template <> _LIBCPP_EXPORTED_FROM_ABI void moneypunct_byname::init(const char*); extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS moneypunct_byname; extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS moneypunct_byname; -#endif +# endif // money_get @@ -2386,9 +2390,9 @@ void __money_get<_CharT>::__gather_info( } extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS __money_get; -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS __money_get; -#endif +# endif template > class _LIBCPP_TEMPLATE_VIS money_get : public locale::facet, private __money_get<_CharT> { @@ -2696,9 +2700,9 @@ _InputIterator money_get<_CharT, _InputIterator>::do_get( } extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS money_get; -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS money_get; -#endif +# endif // money_put @@ -2874,9 +2878,9 @@ void __money_put<_CharT>::__format( } extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS __money_put; -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS __money_put; -#endif +# endif template > class _LIBCPP_TEMPLATE_VIS money_put : public locale::facet, private __money_put<_CharT> { @@ -3020,9 +3024,9 @@ _OutputIterator money_put<_CharT, _OutputIterator>::do_put( } extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS money_put; -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS money_put; -#endif +# endif // messages @@ -3066,18 +3070,18 @@ locale::id messages<_CharT>::id; template typename messages<_CharT>::catalog messages<_CharT>::do_open(const basic_string& __nm, const locale&) const { -#ifdef _LIBCPP_HAS_CATOPEN +# ifdef _LIBCPP_HAS_CATOPEN return (catalog)catopen(__nm.c_str(), NL_CAT_LOCALE); -#else // !_LIBCPP_HAS_CATOPEN +# else // !_LIBCPP_HAS_CATOPEN (void)__nm; return -1; -#endif // _LIBCPP_HAS_CATOPEN +# endif // _LIBCPP_HAS_CATOPEN } template typename messages<_CharT>::string_type messages<_CharT>::do_get(catalog __c, int __set, int __msgid, const string_type& __dflt) const { -#ifdef _LIBCPP_HAS_CATOPEN +# ifdef _LIBCPP_HAS_CATOPEN string __ndflt; __narrow_to_utf8()( std::back_inserter(__ndflt), __dflt.c_str(), __dflt.c_str() + __dflt.size()); @@ -3087,27 +3091,27 @@ messages<_CharT>::do_get(catalog __c, int __set, int __msgid, const string_type& string_type __w; __widen_from_utf8()(std::back_inserter(__w), __n, __n + std::strlen(__n)); return __w; -#else // !_LIBCPP_HAS_CATOPEN +# else // !_LIBCPP_HAS_CATOPEN (void)__c; (void)__set; (void)__msgid; return __dflt; -#endif // _LIBCPP_HAS_CATOPEN +# endif // _LIBCPP_HAS_CATOPEN } template void messages<_CharT>::do_close(catalog __c) const { -#ifdef _LIBCPP_HAS_CATOPEN +# ifdef _LIBCPP_HAS_CATOPEN catclose((nl_catd)__c); -#else // !_LIBCPP_HAS_CATOPEN +# else // !_LIBCPP_HAS_CATOPEN (void)__c; -#endif // _LIBCPP_HAS_CATOPEN +# endif // _LIBCPP_HAS_CATOPEN } extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS messages; -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS messages; -#endif +# endif template class _LIBCPP_TEMPLATE_VIS messages_byname : public messages<_CharT> { @@ -3124,11 +3128,11 @@ protected: }; extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS messages_byname; -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS messages_byname; -#endif +# endif -#if _LIBCPP_STD_VER < 26 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_WSTRING_CONVERT) +# if _LIBCPP_STD_VER < 26 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_WSTRING_CONVERT) template ::wstring_convert( __cvtptr_ = new _Codecvt; } -# ifndef _LIBCPP_CXX03_LANG +# ifndef _LIBCPP_CXX03_LANG template inline wstring_convert<_Codecvt, _Elem, _WideAlloc, _ByteAlloc>::wstring_convert(wstring_convert&& __wc) @@ -3218,7 +3222,7 @@ inline wstring_convert<_Codecvt, _Elem, _WideAlloc, _ByteAlloc>::wstring_convert __wc.__cvtptr_ = nullptr; } -# endif // _LIBCPP_CXX03_LANG +# endif // _LIBCPP_CXX03_LANG _LIBCPP_SUPPRESS_DEPRECATED_PUSH template @@ -3372,14 +3376,14 @@ private: bool __always_noconv_; public: -# ifndef _LIBCPP_CXX03_LANG +# ifndef _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI wbuffer_convert() : wbuffer_convert(nullptr) {} explicit _LIBCPP_HIDE_FROM_ABI wbuffer_convert(streambuf* __bytebuf, _Codecvt* __pcvt = new _Codecvt, state_type __state = state_type()); -# else +# else _LIBCPP_EXPLICIT_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI wbuffer_convert(streambuf* __bytebuf = nullptr, _Codecvt* __pcvt = new _Codecvt, state_type __state = state_type()); -# endif +# endif _LIBCPP_HIDE_FROM_ABI ~wbuffer_convert(); @@ -3735,7 +3739,7 @@ wbuffer_convert<_Codecvt, _Elem, _Tr>* wbuffer_convert<_Codecvt, _Elem, _Tr>::__ _LIBCPP_SUPPRESS_DEPRECATED_POP -#endif // _LIBCPP_STD_VER < 26 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_WSTRING_CONVERT) +# endif // _LIBCPP_STD_VER < 26 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_WSTRING_CONVERT) _LIBCPP_END_NAMESPACE_STD @@ -3743,6 +3747,8 @@ _LIBCPP_POP_MACROS // NOLINTEND(libcpp-robust-against-adl) +#endif // !defined(_LIBCPP_HAS_NO_LOCALIZATION) + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/locale.h b/libcxx/include/locale.h index 20910fa2f97e0c..425bf47d437ac8 100644 --- a/libcxx/include/locale.h +++ b/libcxx/include/locale.h @@ -35,10 +35,6 @@ #include <__config> -#if defined(_LIBCPP_HAS_NO_LOCALIZATION) -# error " is not supported since libc++ has been configured without support for localization." -#endif - #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif diff --git a/libcxx/include/mdspan b/libcxx/include/mdspan index 8d443f4acd1ddd..aa7ba278b1aa06 100644 --- a/libcxx/include/mdspan +++ b/libcxx/include/mdspan @@ -20,6 +20,10 @@ namespace std { template using dextents = see below; + // [mdspan.extents.dims], alias template dims + template + using dims = see below; // since C++26 + // [mdspan.layout], layout mapping struct layout_left; struct layout_right; diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 4ad506781c489a..f4aaa14c1c2ee6 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -1962,8 +1962,6 @@ module std_private_type_traits_is_fundamental [system module std_private_type_traits_is_implicitly_default_constructible [system] { header "__type_traits/is_implicitly_default_constructible.h" } module std_private_type_traits_is_integral [system] { header "__type_traits/is_integral.h" } module std_private_type_traits_is_literal_type [system] { header "__type_traits/is_literal_type.h" } -module std_private_type_traits_is_member_function_pointer [system] { header "__type_traits/is_member_function_pointer.h" } -module std_private_type_traits_is_member_object_pointer [system] { header "__type_traits/is_member_object_pointer.h" } module std_private_type_traits_is_member_pointer [system] { header "__type_traits/is_member_pointer.h" } module std_private_type_traits_is_nothrow_assignable [system] { header "__type_traits/is_nothrow_assignable.h" } module std_private_type_traits_is_nothrow_constructible [system] { diff --git a/libcxx/include/regex b/libcxx/include/regex index 17ad0cf5b2aea7..b8141351213212 100644 --- a/libcxx/include/regex +++ b/libcxx/include/regex @@ -4214,7 +4214,7 @@ public: _LIBCPP_HIDE_FROM_ABI int compare(const value_type* __s) const { return str().compare(__s); } _LIBCPP_HIDE_FROM_ABI void swap(sub_match& __s) _NOEXCEPT_(__is_nothrow_swappable_v<_BidirectionalIterator>) { - this->template pair<_BidirectionalIterator, _BidirectionalIterator>::swap(__s); + this->pair<_BidirectionalIterator, _BidirectionalIterator>::swap(__s); std::swap(matched, __s.matched); } }; diff --git a/libcxx/include/semaphore b/libcxx/include/semaphore index 8d3b04475c092d..95a4375f21c175 100644 --- a/libcxx/include/semaphore +++ b/libcxx/include/semaphore @@ -47,30 +47,28 @@ using binary_semaphore = counting_semaphore<1>; #include <__config> -#ifdef _LIBCPP_HAS_NO_THREADS -# error " is not supported since libc++ has been configured without support for threads." -#endif - -#include <__assert> -#include <__atomic/atomic_base.h> -#include <__atomic/atomic_sync.h> -#include <__atomic/memory_order.h> -#include <__chrono/time_point.h> -#include <__thread/poll_with_backoff.h> -#include <__thread/support.h> -#include <__thread/timed_backoff_policy.h> -#include -#include -#include - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif +#if !defined(_LIBCPP_HAS_NO_THREADS) + +# include <__assert> +# include <__atomic/atomic_base.h> +# include <__atomic/atomic_sync.h> +# include <__atomic/memory_order.h> +# include <__chrono/time_point.h> +# include <__thread/poll_with_backoff.h> +# include <__thread/support.h> +# include <__thread/timed_backoff_policy.h> +# include +# include +# include + +# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +# endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +# include <__undef_macros> -#if _LIBCPP_STD_VER >= 14 +# if _LIBCPP_STD_VER >= 14 _LIBCPP_BEGIN_NAMESPACE_STD @@ -82,7 +80,7 @@ functions. It avoids contention against users' own use of those facilities. */ -# define _LIBCPP_SEMAPHORE_MAX (numeric_limits::max()) +# define _LIBCPP_SEMAPHORE_MAX (numeric_limits::max()) class __atomic_semaphore_base { __atomic_base __a_; @@ -177,10 +175,12 @@ _LIBCPP_SUPPRESS_DEPRECATED_POP _LIBCPP_END_NAMESPACE_STD -#endif // _LIBCPP_STD_VER >= 14 +# endif // _LIBCPP_STD_VER >= 14 _LIBCPP_POP_MACROS +#endif // !defined(_LIBCPP_HAS_NO_THREADS) + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include #endif diff --git a/libcxx/include/shared_mutex b/libcxx/include/shared_mutex index 397ac290d9b2ef..f63bd25493878b 100644 --- a/libcxx/include/shared_mutex +++ b/libcxx/include/shared_mutex @@ -124,31 +124,29 @@ template #include <__config> -#ifdef _LIBCPP_HAS_NO_THREADS -# error " is not supported since libc++ has been configured without support for threads." -#endif - -#include <__chrono/duration.h> -#include <__chrono/steady_clock.h> -#include <__chrono/time_point.h> -#include <__condition_variable/condition_variable.h> -#include <__memory/addressof.h> -#include <__mutex/mutex.h> -#include <__mutex/tag_types.h> -#include <__mutex/unique_lock.h> -#include <__system_error/system_error.h> -#include <__utility/swap.h> -#include -#include +#if !defined(_LIBCPP_HAS_NO_THREADS) + +# include <__chrono/duration.h> +# include <__chrono/steady_clock.h> +# include <__chrono/time_point.h> +# include <__condition_variable/condition_variable.h> +# include <__memory/addressof.h> +# include <__mutex/mutex.h> +# include <__mutex/tag_types.h> +# include <__mutex/unique_lock.h> +# include <__system_error/system_error.h> +# include <__utility/swap.h> +# include +# include _LIBCPP_PUSH_MACROS -#include <__undef_macros> +# include <__undef_macros> -#if _LIBCPP_STD_VER >= 14 +# if _LIBCPP_STD_VER >= 14 -# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -# endif +# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +# endif _LIBCPP_BEGIN_NAMESPACE_STD @@ -181,7 +179,7 @@ struct _LIBCPP_EXPORTED_FROM_ABI __shared_mutex_base { // native_handle_type native_handle(); // See 30.2.3 }; -# if _LIBCPP_STD_VER >= 17 +# if _LIBCPP_STD_VER >= 17 class _LIBCPP_EXPORTED_FROM_ABI _LIBCPP_THREAD_SAFETY_ANNOTATION(__capability__("shared_mutex")) shared_mutex { __shared_mutex_base __base_; @@ -218,7 +216,7 @@ public: // typedef __shared_mutex_base::native_handle_type native_handle_type; // _LIBCPP_HIDE_FROM_ABI native_handle_type native_handle() { return __base::unlock_shared(); } }; -# endif +# endif class _LIBCPP_EXPORTED_FROM_ABI _LIBCPP_THREAD_SAFETY_ANNOTATION(__capability__("shared_timed_mutex")) shared_timed_mutex { @@ -453,10 +451,12 @@ inline _LIBCPP_HIDE_FROM_ABI void swap(shared_lock<_Mutex>& __x, shared_lock<_Mu _LIBCPP_END_NAMESPACE_STD -#endif // _LIBCPP_STD_VER >= 14 +# endif // _LIBCPP_STD_VER >= 14 _LIBCPP_POP_MACROS +#endif // !defined(_LIBCPP_HAS_NO_THREADS) + #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include #endif diff --git a/libcxx/include/stop_token b/libcxx/include/stop_token index fee195f9d63d4b..c9c54dfb5a755b 100644 --- a/libcxx/include/stop_token +++ b/libcxx/include/stop_token @@ -33,18 +33,18 @@ namespace std { #include <__config> -#ifdef _LIBCPP_HAS_NO_THREADS -# error " is not supported since libc++ has been configured without support for threads." -#endif +#if !defined(_LIBCPP_HAS_NO_THREADS) -#include <__stop_token/stop_callback.h> -#include <__stop_token/stop_source.h> -#include <__stop_token/stop_token.h> -#include +# include <__stop_token/stop_callback.h> +# include <__stop_token/stop_source.h> +# include <__stop_token/stop_token.h> +# include -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif +# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +# endif + +#endif // !defined(_LIBCPP_HAS_NO_THREADS) #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include diff --git a/libcxx/include/thread b/libcxx/include/thread index 68ce63bd0143df..25cb7ce6d7231e 100644 --- a/libcxx/include/thread +++ b/libcxx/include/thread @@ -88,25 +88,25 @@ void sleep_for(const chrono::duration& rel_time); #include <__config> -#ifdef _LIBCPP_HAS_NO_THREADS -# error " is not supported since libc++ has been configured without support for threads." -#endif +#if !defined(_LIBCPP_HAS_NO_THREADS) -#include <__thread/formatter.h> -#include <__thread/jthread.h> -#include <__thread/support.h> -#include <__thread/this_thread.h> -#include <__thread/thread.h> -#include +# include <__thread/formatter.h> +# include <__thread/jthread.h> +# include <__thread/support.h> +# include <__thread/this_thread.h> +# include <__thread/thread.h> +# include // standard-mandated includes // [thread.syn] -#include +# include -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif +# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +# endif + +#endif // !defined(_LIBCPP_HAS_NO_THREADS) #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) # include diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits index aee9fcf4137f3c..a77ddadafb6810 100644 --- a/libcxx/include/type_traits +++ b/libcxx/include/type_traits @@ -467,8 +467,6 @@ namespace std #include <__type_traits/is_implicitly_default_constructible.h> #include <__type_traits/is_integral.h> #include <__type_traits/is_literal_type.h> -#include <__type_traits/is_member_function_pointer.h> -#include <__type_traits/is_member_object_pointer.h> #include <__type_traits/is_member_pointer.h> #include <__type_traits/is_nothrow_assignable.h> #include <__type_traits/is_nothrow_constructible.h> diff --git a/libcxx/include/version b/libcxx/include/version index 1f66bce40df8a2..c971336bcb85ce 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -159,7 +159,8 @@ __cpp_lib_make_unique 201304L __cpp_lib_map_try_emplace 201411L __cpp_lib_math_constants 201907L __cpp_lib_math_special_functions 201603L -__cpp_lib_mdspan 202207L +__cpp_lib_mdspan 202406L + 202207L // C++23 __cpp_lib_memory_resource 201603L __cpp_lib_move_iterator_concept 202207L __cpp_lib_move_only_function 202110L @@ -530,6 +531,8 @@ __cpp_lib_void_t 201411L // # define __cpp_lib_is_virtual_base_of 202406L // # define __cpp_lib_is_within_lifetime 202306L // # define __cpp_lib_linalg 202311L +# undef __cpp_lib_mdspan +# define __cpp_lib_mdspan 202406L // # define __cpp_lib_optional_range_support 202406L # undef __cpp_lib_out_ptr // # define __cpp_lib_out_ptr 202311L diff --git a/libcxx/modules/std/chrono.inc b/libcxx/modules/std/chrono.inc index 3ba3c46150c9cf..8d313f755f7253 100644 --- a/libcxx/modules/std/chrono.inc +++ b/libcxx/modules/std/chrono.inc @@ -37,7 +37,6 @@ export namespace std { // [time.duration.comparisons], duration comparisons using std::chrono::operator==; - using std::chrono::operator!=; using std::chrono::operator<; using std::chrono::operator>; using std::chrono::operator<=; diff --git a/libcxx/test/std/containers/views/mdspan/extents/dims.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/dims.pass.cpp new file mode 100644 index 00000000000000..e74bc0e66fca1c --- /dev/null +++ b/libcxx/test/std/containers/views/mdspan/extents/dims.pass.cpp @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 + +// + +// template +// using dims = see below; +// +// Result: A type E that is a specialization of extents such that +// E::rank() == Rank && E::rank() == E::rank_dynamic() is true, +// and E::index_type denotes IndexType. + +#include +#include + +#include "test_macros.h" + +template +void test_alias_template_dims() { + constexpr size_t D = std::dynamic_extent; + ASSERT_SAME_TYPE(std::dims<0, IndexType>, std::extents); + ASSERT_SAME_TYPE(std::dims<1, IndexType>, std::extents); + ASSERT_SAME_TYPE(std::dims<2, IndexType>, std::extents); + ASSERT_SAME_TYPE(std::dims<3, IndexType>, std::extents); + ASSERT_SAME_TYPE(std::dims<9, IndexType>, std::extents); +} + +template <> +void test_alias_template_dims() { + constexpr size_t D = std::dynamic_extent; + ASSERT_SAME_TYPE(std::dims<0>, std::extents); + ASSERT_SAME_TYPE(std::dims<1>, std::extents); + ASSERT_SAME_TYPE(std::dims<2>, std::extents); + ASSERT_SAME_TYPE(std::dims<3>, std::extents); + ASSERT_SAME_TYPE(std::dims<9>, std::extents); +} + +int main(int, char**) { + test_alias_template_dims(); + test_alias_template_dims(); + test_alias_template_dims(); + return 0; +} diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/mdspan.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/mdspan.version.compile.pass.cpp index 4ef33823064216..64d1c99b223f42 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/mdspan.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/mdspan.version.compile.pass.cpp @@ -18,6 +18,7 @@ /* Constant Value __cpp_lib_freestanding_mdspan 202311L [C++26] __cpp_lib_mdspan 202207L [C++23] + 202406L [C++26] __cpp_lib_submdspan 202306L [C++26] */ @@ -115,8 +116,8 @@ # ifndef __cpp_lib_mdspan # error "__cpp_lib_mdspan should be defined in c++26" # endif -# if __cpp_lib_mdspan != 202207L -# error "__cpp_lib_mdspan should have the value 202207L in c++26" +# if __cpp_lib_mdspan != 202406L +# error "__cpp_lib_mdspan should have the value 202406L in c++26" # endif # if !defined(_LIBCPP_VERSION) diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index aa5ff80afb56a2..a01ee702a51723 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -147,6 +147,7 @@ __cpp_lib_math_constants 201907L [C++20] __cpp_lib_math_special_functions 201603L [C++17] __cpp_lib_mdspan 202207L [C++23] + 202406L [C++26] __cpp_lib_memory_resource 201603L [C++17] __cpp_lib_modules 202207L [C++23] __cpp_lib_move_iterator_concept 202207L [C++20] @@ -7289,8 +7290,8 @@ # ifndef __cpp_lib_mdspan # error "__cpp_lib_mdspan should be defined in c++26" # endif -# if __cpp_lib_mdspan != 202207L -# error "__cpp_lib_mdspan should have the value 202207L in c++26" +# if __cpp_lib_mdspan != 202406L +# error "__cpp_lib_mdspan should have the value 202406L in c++26" # endif # if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_PMR diff --git a/libcxx/test/std/ranges/range.factories/range.single.view/empty.pass.cpp b/libcxx/test/std/ranges/range.factories/range.single.view/empty.pass.cpp new file mode 100644 index 00000000000000..7e6ff015ea9a41 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.single.view/empty.pass.cpp @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// static constexpr bool empty() noexcept; + +#include +#include +#include +#include + +#include "test_macros.h" + +struct Empty {}; +struct BigType { + char buffer[64] = {10}; +}; + +template +constexpr void test_empty(T value) { + using SingleView = std::ranges::single_view; + + { + std::same_as decltype(auto) result = SingleView::empty(); + assert(result == false); + static_assert(noexcept(SingleView::empty())); + } + + { + SingleView sv{value}; + + std::same_as decltype(auto) result = std::ranges::empty(sv); + assert(result == false); + static_assert(noexcept(std::ranges::empty(sv))); + } + { + const SingleView sv{value}; + + std::same_as decltype(auto) result = std::ranges::empty(sv); + assert(result == false); + static_assert(noexcept(std::ranges::empty(std::as_const(sv)))); + } +} + +constexpr bool test() { + test_empty(92); + test_empty(Empty{}); + test_empty(BigType{}); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/time/time.cal/time.cal.ymdlast/time.cal.ymdlast.nonmembers/comparisons.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.ymdlast/time.cal.ymdlast.nonmembers/comparisons.pass.cpp index 45d7e51fbe7182..e28b6d8609bc4d 100644 --- a/libcxx/test/std/time/time.cal/time.cal.ymdlast/time.cal.ymdlast.nonmembers/comparisons.pass.cpp +++ b/libcxx/test/std/time/time.cal/time.cal.ymdlast/time.cal.ymdlast.nonmembers/comparisons.pass.cpp @@ -5,19 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// UNSUPPORTED: c++98, c++03, c++11, c++14, c++17 // // class year_month_day_last; // constexpr bool operator==(const year_month_day_last& x, const year_month_day_last& y) noexcept; -// Returns: x.year() == y.year() && x.month_day_last() == y.month_day_last(). -// -// constexpr bool operator< (const year_month_day_last& x, const year_month_day_last& y) noexcept; -// Returns: -// If x.year() < y.year(), returns true. -// Otherwise, if x.year() > y.year(), returns false. -// Otherwise, returns x.month_day_last() < y.month_day_last() +// constexpr bool operator<=>(const year_month_day_last& x, const year_month_day_last& y) noexcept; #include #include @@ -26,63 +21,61 @@ #include "test_macros.h" #include "test_comparisons.h" -int main(int, char**) -{ - using year = std::chrono::year; - using month = std::chrono::month; - using month_day_last = std::chrono::month_day_last; - using year_month_day_last = std::chrono::year_month_day_last; - - AssertComparisonsAreNoexcept(); - AssertComparisonsReturnBool(); - - constexpr month January = std::chrono::January; - constexpr month February = std::chrono::February; - - static_assert( testComparisons( - year_month_day_last{year{1234}, month_day_last{January}}, - year_month_day_last{year{1234}, month_day_last{January}}, - true, false), ""); - - // different month - static_assert( testComparisons( - year_month_day_last{year{1234}, month_day_last{January}}, - year_month_day_last{year{1234}, month_day_last{February}}, - false, true), ""); - - // different year - static_assert( testComparisons( - year_month_day_last{year{1234}, month_day_last{January}}, - year_month_day_last{year{1235}, month_day_last{January}}, - false, true), ""); - - // different month - static_assert( testComparisons( - year_month_day_last{year{1234}, month_day_last{January}}, - year_month_day_last{year{1234}, month_day_last{February}}, - false, true), ""); - - // different year and month - static_assert( testComparisons( - year_month_day_last{year{1234}, month_day_last{February}}, - year_month_day_last{year{1235}, month_day_last{January}}, - false, true), ""); - - // same year, different months - for (unsigned i = 1; i < 12; ++i) - for (unsigned j = 1; j < 12; ++j) - assert((testComparisons( - year_month_day_last{year{1234}, month_day_last{month{i}}}, - year_month_day_last{year{1234}, month_day_last{month{j}}}, - i == j, i < j ))); - - // same month, different years - for (int i = 1000; i < 2000; ++i) - for (int j = 1000; j < 2000; ++j) - assert((testComparisons( - year_month_day_last{year{i}, month_day_last{January}}, - year_month_day_last{year{j}, month_day_last{January}}, - i == j, i < j ))); - - return 0; +constexpr bool test() { + using year = std::chrono::year; + using month = std::chrono::month; + using month_day_last = std::chrono::month_day_last; + using year_month_day_last = std::chrono::year_month_day_last; + + constexpr month January = std::chrono::January; + constexpr month February = std::chrono::February; + + assert(testOrder(year_month_day_last{year{1234}, month_day_last{January}}, + year_month_day_last{year{1234}, month_day_last{January}}, + std::strong_ordering::equal)); + + // different month + assert(testOrder(year_month_day_last{year{1234}, month_day_last{January}}, + year_month_day_last{year{1234}, month_day_last{February}}, + std::strong_ordering::less)); + + // different year + assert(testOrder(year_month_day_last{year{1234}, month_day_last{January}}, + year_month_day_last{year{1235}, month_day_last{January}}, + std::strong_ordering::less)); + + // different year and month + assert(testOrder(year_month_day_last{year{1234}, month_day_last{February}}, + year_month_day_last{year{1235}, month_day_last{January}}, + std::strong_ordering::less)); + + // same year, different months + for (unsigned i = 1; i < 12; ++i) + for (unsigned j = 1; j < 12; ++j) + assert((testOrder(year_month_day_last{year{1234}, month_day_last{month{i}}}, + year_month_day_last{year{1234}, month_day_last{month{j}}}, + i == j ? std::strong_ordering::equal + : i < j ? std::strong_ordering::less + : std::strong_ordering::greater))); + + // same month, different years + for (int i = 1000; i < 20; ++i) + for (int j = 1000; j < 20; ++j) + assert((testOrder(year_month_day_last{year{i}, month_day_last{January}}, + year_month_day_last{year{j}, month_day_last{January}}, + i == j ? std::strong_ordering::equal + : i < j ? std::strong_ordering::less + : std::strong_ordering::greater))); + return true; +} + +int main(int, char**) { + using year_month_day_last = std::chrono::year_month_day_last; + AssertOrderAreNoexcept(); + AssertOrderReturn(); + + test(); + static_assert(test()); + + return 0; } diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index 7bc11aa401829d..773b1523cde4ec 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -868,7 +868,7 @@ def add_version_header(tc): "name": "__cpp_lib_mdspan", "values": { "c++23": 202207, - # "c++26": 202406, # P2389R2 dextents Index Type Parameter + "c++26": 202406, # P2389R2 dextents Index Type Parameter }, "headers": ["mdspan"], }, diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index cef6271e4c8f8a..9e28b1c50be504 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -2428,9 +2428,12 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // file's symbol table. If any of those library functions are defined in a // bitcode file in an archive member, we need to arrange to use LTO to // compile those archive members by adding them to the link beforehand. - if (!ctx.bitcodeFileInstances.empty()) - for (auto *s : lto::LTO::getRuntimeLibcallSymbols()) + if (!ctx.bitcodeFileInstances.empty()) { + llvm::Triple TT( + ctx.bitcodeFileInstances.front()->obj->getTargetTriple()); + for (auto *s : lto::LTO::getRuntimeLibcallSymbols(TT)) ctx.symtab.addLibcall(s); + } // Windows specific -- if __load_config_used can be resolved, resolve it. if (ctx.symtab.findUnderscore("_load_config_used")) diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index a94cb3a7cfef73..40e095a133d953 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -202,6 +202,7 @@ static std::tuple parseEmulation(StringRef emul) { .Case("elf64_amdgpu", {ELF64LEKind, EM_AMDGPU}) .Case("elf64loongarch", {ELF64LEKind, EM_LOONGARCH}) .Case("elf64_s390", {ELF64BEKind, EM_S390}) + .Case("hexagonelf", {ELF32LEKind, EM_HEXAGON}) .Default({ELFNoneKind, EM_NONE}); if (ret.first == ELFNoneKind) @@ -2883,9 +2884,11 @@ template void LinkerDriver::link(opt::InputArgList &args) { // to, i.e. if the symbol's definition is in bitcode. Any other required // libcall symbols will be added to the link after LTO when we add the LTO // object file to the link. - if (!ctx.bitcodeFiles.empty()) - for (auto *s : lto::LTO::getRuntimeLibcallSymbols()) + if (!ctx.bitcodeFiles.empty()) { + llvm::Triple TT(ctx.bitcodeFiles.front()->obj->getTargetTriple()); + for (auto *s : lto::LTO::getRuntimeLibcallSymbols(TT)) handleLibcall(s); + } // Archive members defining __wrap symbols may be extracted. std::vector wrapped = addWrappedSymbols(args); diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index 41bd9a95053f7f..92ef9330141fca 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -452,6 +452,7 @@ static std::pair parseBfdName(StringRef s) { .Case("elf32-loongarch", {ELF32LEKind, EM_LOONGARCH}) .Case("elf64-loongarch", {ELF64LEKind, EM_LOONGARCH}) .Case("elf64-s390", {ELF64BEKind, EM_S390}) + .Cases("elf32-hexagon", "elf32-littlehexagon", {ELF32LEKind, EM_HEXAGON}) .Default({ELFNoneKind, EM_NONE}); } @@ -461,20 +462,28 @@ static std::pair parseBfdName(StringRef s) { void ScriptParser::readOutputFormat() { expect("("); - StringRef s; - config->bfdname = unquote(next()); + StringRef s = unquote(next()); if (!consume(")")) { expect(","); - s = unquote(next()); + StringRef tmp = unquote(next()); if (config->optEB) - config->bfdname = s; + s = tmp; expect(","); - s = unquote(next()); + tmp = unquote(next()); if (config->optEL) - config->bfdname = s; + s = tmp; consume(")"); } - s = config->bfdname; + // If more than one OUTPUT_FORMAT is specified, only the first is checked. + if (!config->bfdname.empty()) + return; + config->bfdname = s; + + if (s == "binary") { + config->oFormatBinary = true; + return; + } + if (s.consume_back("-freebsd")) config->osabi = ELFOSABI_FREEBSD; diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index c95170c1165b22..05179bfdcb5369 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -79,6 +79,8 @@ ELF Improvements * ``PROVIDE(lhs = rhs) PROVIDE(rhs = ...)``, ``lhs`` is now defined only if ``rhs`` is needed. (`#74771 `_) (`#87530 `_) +* ``OUTPUT_FORMAT(binary)`` is now supported. + (`#98837 `_) * Orphan placement is refined to prefer the last similar section when its rank <= orphan's rank. (`#94099 `_) Non-alloc orphan sections are now placed at the end. diff --git a/lld/test/COFF/lib.test b/lld/test/COFF/lib.test index 7525ef4226cda5..82abca6ec9307b 100644 --- a/lld/test/COFF/lib.test +++ b/lld/test/COFF/lib.test @@ -1,6 +1,14 @@ # RUN: lld-link /machine:x64 /def:%S/Inputs/library.def /out:%t.lib # RUN: llvm-nm %t.lib | FileCheck %s +CHECK: 00000000 R __imp_constant +CHECK: 00000000 R constant + +CHECK: 00000000 D __imp_data + +CHECK: 00000000 T __imp_function +CHECK: 00000000 T function + CHECK: 00000000 a @comp.id CHECK: 00000000 a @feat.00 CHECK: 00000000 W alias @@ -11,11 +19,3 @@ CHECK: 00000000 a @feat.00 CHECK: 00000000 W __imp_alias CHECK: U __imp_function -CHECK: 00000000 R __imp_constant -CHECK: 00000000 R constant - -CHECK: 00000000 D __imp_data - -CHECK: 00000000 T __imp_function -CHECK: 00000000 T function - diff --git a/lld/test/ELF/emulation-hexagon.s b/lld/test/ELF/emulation-hexagon.s new file mode 100644 index 00000000000000..a8a02d4c428b5c --- /dev/null +++ b/lld/test/ELF/emulation-hexagon.s @@ -0,0 +1,34 @@ +# REQUIRES: hexagon +# RUN: llvm-mc -filetype=obj -triple=hexagon %s -o %t.o +# RUN: ld.lld %t.o -o %t +# RUN: llvm-readelf --file-headers %t | FileCheck --check-prefix=CHECK %s +# RUN: ld.lld -m hexagonelf %t.o -o %t +# RUN: llvm-readelf --file-headers %t | FileCheck --check-prefix=CHECK %s + +# RUN: echo 'OUTPUT_FORMAT(elf32-littlehexagon)' > %t.script +# RUN: ld.lld %t.script %t.o -o %t +# RUN: llvm-readelf --file-headers %t | FileCheck --check-prefix=CHECK %s + +# RUN: echo 'OUTPUT_FORMAT(elf32-hexagon)' > %t.script +# RUN: ld.lld %t.script %t.o -o %t +# RUN: llvm-readelf --file-headers %t | FileCheck --check-prefix=CHECK %s + +# CHECK: ELF Header: +# CHECK-NEXT: Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 +# CHECK-NEXT: Class: ELF32 +# CHECK-NEXT: Data: 2's complement, little endian +# CHECK-NEXT: Version: 1 (current) +# CHECK-NEXT: OS/ABI: UNIX - System V +# CHECK-NEXT: ABI Version: 0 +# CHECK-NEXT: Type: EXEC (Executable file) +# CHECK-NEXT: Machine: Qualcomm Hexagon +# CHECK-NEXT: Version: 0x1 +# CHECK-NEXT: Entry point address: 0x200B4 +# CHECK-NEXT: Start of program headers: 52 (bytes into file) +# CHECK-NEXT: Start of section headers: +# CHECK-NEXT: Flags: 0x60 +# CHECK-NEXT: Size of this header: 52 (bytes) +# CHECK-NEXT: Size of program headers: 32 (bytes) + +.globl _start +_start: diff --git a/lld/test/ELF/invalid-linkerscript.test b/lld/test/ELF/invalid-linkerscript.test index c8770bd6aa720f..4cbedf639cb1a3 100644 --- a/lld/test/ELF/invalid-linkerscript.test +++ b/lld/test/ELF/invalid-linkerscript.test @@ -58,3 +58,7 @@ # RUN: not ld.lld %t9 no-such-file 2>&1 | FileCheck -check-prefix=ERR9 %s # ERR9: , expected, but got y # ERR9: cannot open no-such-file: + +# RUN: echo 'OUTPUT_FORMAT("")' > %t10 +# RUN: not ld.lld %t10 2>&1 | FileCheck -check-prefix=ERR10 %s +# ERR10: error: {{.*}}:1: unknown output format name: diff --git a/lld/test/ELF/oformat-binary.s b/lld/test/ELF/oformat-binary.s index 38af6805140fce..a3780a68b24e13 100644 --- a/lld/test/ELF/oformat-binary.s +++ b/lld/test/ELF/oformat-binary.s @@ -6,8 +6,17 @@ # CHECK: 0000000 90 11 22 # CHECK-NEXT: 0000003 -## Check case when linkerscript is used. -# RUN: echo "SECTIONS { . = 0x1000; }" > %t.script +## OUTPUT_FORMAT(binary) selects the binary format as well. +# RUN: echo "OUTPUT_FORMAT(binary)" > %t.script +# RUN: ld.lld -o %t2.out -T %t.script %t +# RUN: od -t x1 -v %t2.out | FileCheck %s +## More OUTPUT_FORMAT commands are ignored. +# RUN: echo "OUTPUT_FORMAT("binary")OUTPUT_FORMAT(elf64-x86-64)" > %t.script +# RUN: ld.lld -o %t2.out -T %t.script %t +# RUN: od -t x1 -v %t2.out | FileCheck %s + +## --oformat=binary overrides an ELF OUTPUT_FORMAT. +# RUN: echo "OUTPUT_FORMAT(elf64-x86-64) SECTIONS { . = 0x1000; }" > %t.script # RUN: ld.lld -o %t2.out --script %t.script %t --oformat binary # RUN: od -t x1 -v %t2.out | FileCheck %s @@ -45,6 +54,10 @@ # RUN: | FileCheck %s --check-prefix ERR # ERR: unknown --oformat value: foo +# RUN: echo "OUTPUT_FORMAT(binary-freebsd)" > %t.script +# RUN: not ld.lld -T %t.script %t -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR2 +# ERR2: error: {{.*}}.script:1: unknown output format name: binary-freebsd + # RUN: ld.lld -o /dev/null %t --oformat elf # RUN: ld.lld -o /dev/null %t --oformat=elf-foo diff --git a/lld/test/ELF/systemz-gotent-relax-und-dso.s b/lld/test/ELF/systemz-gotent-relax-und-dso.s index fdbda701dad123..5a1bd7f949f897 100644 --- a/lld/test/ELF/systemz-gotent-relax-und-dso.s +++ b/lld/test/ELF/systemz-gotent-relax-und-dso.s @@ -14,9 +14,9 @@ # DISASM: Disassembly of section .text: # DISASM-EMPTY: # DISASM-NEXT: : -# DISASM-NEXT: bc 0, 0 +# DISASM-NEXT: nop 0 # DISASM: : -# DISASM-NEXT: bc 0, 0 +# DISASM-NEXT: nop 0 # DISASM: <_start>: # DISASM-NEXT: lgrl %r1, 0x2400 # DISASM-NEXT: lgrl %r1, 0x2400 diff --git a/lld/test/ELF/systemz-gotent-relax.s b/lld/test/ELF/systemz-gotent-relax.s index 7ff82b9a190006..e84fd8d4653e9c 100644 --- a/lld/test/ELF/systemz-gotent-relax.s +++ b/lld/test/ELF/systemz-gotent-relax.s @@ -30,9 +30,9 @@ # DISASM: Disassembly of section .text: # DISASM: 00000000010011e0 : -# DISASM-NEXT: bc 0, 0 +# DISASM-NEXT: nop 0 # DISASM: 00000000010011e4 : -# DISASM-NEXT: bc 0, 0 +# DISASM-NEXT: nop 0 # DISASM: 00000000010011e8 : # DISASM-NEXT: br %r14 # DISASM: 00000000010011ea <_start>: diff --git a/lld/test/ELF/systemz-init-padding.s b/lld/test/ELF/systemz-init-padding.s index c56b98d43f1b0e..c7d9e33c22b1b2 100644 --- a/lld/test/ELF/systemz-init-padding.s +++ b/lld/test/ELF/systemz-init-padding.s @@ -12,7 +12,7 @@ # CHECK: <.init>: # CHECK-NEXT: brasl %r14, -# CHECK-NEXT: bcr 0, %r7 +# CHECK-NEXT: nopr %r7 # CHECK-NEXT: lg %r4, 272(%r15) .text diff --git a/lld/test/ELF/systemz-plt.s b/lld/test/ELF/systemz-plt.s index 4669f01f588121..c7563cd18c2749 100644 --- a/lld/test/ELF/systemz-plt.s +++ b/lld/test/ELF/systemz-plt.s @@ -48,9 +48,9 @@ # DIS-NEXT: 100102c: d2 07 f0 30 10 08 mvc 48(8,%r15), 8(%r1) # DIS-NEXT: 1001032: e3 10 10 10 00 04 lg %r1, 16(%r1) # DIS-NEXT: 1001038: 07 f1 br %r1 -# DIS-NEXT: 100103a: 07 00 bcr 0, %r0 -# DIS-NEXT: 100103c: 07 00 bcr 0, %r0 -# DIS-NEXT: 100103e: 07 00 bcr 0, %r0 +# DIS-NEXT: 100103a: 07 00 nopr %r0 +# DIS-NEXT: 100103c: 07 00 nopr %r0 +# DIS-NEXT: 100103e: 07 00 nopr %r0 # DIS-NEXT: 1001040: c0 10 00 00 10 54 larl %r1, 0x10030e8 # DIS-NEXT: 1001046: e3 10 10 00 00 04 lg %r1, 0(%r1) # DIS-NEXT: 100104c: 07 f1 br %r1 diff --git a/lld/test/wasm/dylink.s b/lld/test/wasm/dylink.s index 27e8c3ea7a7c6c..ab604fc1adc18f 100644 --- a/lld/test/wasm/dylink.s +++ b/lld/test/wasm/dylink.s @@ -6,6 +6,16 @@ # RUN: wasm-ld --experimental-pic -pie -o %t.wasm %t.o %t.lib.so # RUN: obj2yaml %t.wasm | FileCheck %s +# Same again for wasm64 + +# RUN: llvm-mc -filetype=obj -triple=wasm64-unknown-emscripten -o %t.o %s +# RUN: llvm-mc -filetype=obj -triple=wasm64-unknown-emscripten %p/Inputs/ret32.s -o %t.ret32.o +# RUN: llvm-mc -filetype=obj -triple=wasm64-unknown-emscripten %p/Inputs/libsearch-dyn.s -o %t.dyn.o +# RUN: wasm-ld --experimental-pic -mwasm64 -shared %t.ret32.o %t.dyn.o -o %t.lib.so +# RUN: not wasm-ld --experimental-pic -mwasm64 -pie -o %t.wasm %t.o 2>&1 | FileCheck --check-prefix=ERROR %s +# RUN: wasm-ld --experimental-pic -mwasm64 -pie -o %t.wasm %t.o %t.lib.so +# RUN: obj2yaml %t.wasm | FileCheck %s + # ERROR: error: {{.*}}: undefined symbol: ret32 # ERROR: error: {{.*}}: undefined symbol: _bar .functype ret32 (f32) -> (i32) diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index b66b988005d58d..8c83d17db02f54 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -1320,9 +1320,11 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // We only need to add libcall symbols to the link before LTO if the symbol's // definition is in bitcode. Any other required libcall symbols will be added // to the link after LTO when we add the LTO object file to the link. - if (!ctx.bitcodeFiles.empty()) - for (auto *s : lto::LTO::getRuntimeLibcallSymbols()) + if (!ctx.bitcodeFiles.empty()) { + llvm::Triple TT(ctx.bitcodeFiles.front()->obj->getTargetTriple()); + for (auto *s : lto::LTO::getRuntimeLibcallSymbols(TT)) handleLibcall(s); + } if (errorCount()) return; diff --git a/lld/wasm/InputFiles.cpp b/lld/wasm/InputFiles.cpp index ae557740a18ba5..f3f0ef9a994973 100644 --- a/lld/wasm/InputFiles.cpp +++ b/lld/wasm/InputFiles.cpp @@ -408,6 +408,12 @@ ObjFile::ObjFile(MemoryBufferRef m, StringRef archiveName, bool lazy) this->lazy = lazy; this->archiveName = std::string(archiveName); + // Currently we only do this check for regular object file, and not for shared + // object files. This is because architecture detection for shared objects is + // currently based on a heuristic, which is fallable: + // https://github.com/llvm/llvm-project/issues/98778 + checkArch(wasmObj->getArch()); + // If this isn't part of an archive, it's eagerly linked, so mark it live. if (archiveName.empty()) markLive(); @@ -456,8 +462,6 @@ WasmFileBase::WasmFileBase(Kind k, MemoryBufferRef m) : InputFile(k, m) { bin.release(); wasmObj.reset(obj); - - checkArch(obj->getArch()); } void ObjFile::parse(bool ignoreComdats) { diff --git a/lldb/docs/index.rst b/lldb/docs/index.rst index 3ce23beec2a5e1..d9b8e589eb2ac0 100644 --- a/lldb/docs/index.rst +++ b/lldb/docs/index.rst @@ -125,7 +125,6 @@ interesting areas to contribute to lldb. use/symbolication use/symbols use/remote - use/qemu-testing use/intel_pt use/ondemand use/aarch64-linux @@ -153,6 +152,7 @@ interesting areas to contribute to lldb. resources/contributing resources/build resources/test + resources/qemu-testing resources/debugging resources/fuzzing resources/sbapi diff --git a/lldb/docs/use/qemu-testing.rst b/lldb/docs/resources/qemu-testing.rst similarity index 100% rename from lldb/docs/use/qemu-testing.rst rename to lldb/docs/resources/qemu-testing.rst diff --git a/lldb/include/lldb/API/SBValue.h b/lldb/include/lldb/API/SBValue.h index 65920c76df7a8d..bec816fb451844 100644 --- a/lldb/include/lldb/API/SBValue.h +++ b/lldb/include/lldb/API/SBValue.h @@ -89,6 +89,8 @@ class LLDB_API SBValue { lldb::SBValue GetNonSyntheticValue(); + lldb::SBValue GetSyntheticValue(); + lldb::DynamicValueType GetPreferDynamicValue(); void SetPreferDynamicValue(lldb::DynamicValueType use_dynamic); diff --git a/lldb/include/lldb/Core/EmulateInstruction.h b/lldb/include/lldb/Core/EmulateInstruction.h index 93c16537adba12..b459476883fc53 100644 --- a/lldb/include/lldb/Core/EmulateInstruction.h +++ b/lldb/include/lldb/Core/EmulateInstruction.h @@ -369,6 +369,8 @@ class EmulateInstruction : public PluginInterface { virtual bool ReadInstruction() = 0; + virtual std::optional GetLastInstrSize() { return std::nullopt; } + virtual bool EvaluateInstruction(uint32_t evaluate_options) = 0; virtual InstructionCondition GetInstructionCondition() { diff --git a/lldb/include/lldb/Core/Progress.h b/lldb/include/lldb/Core/Progress.h index cd87be79c4f0e3..421e435a9e685a 100644 --- a/lldb/include/lldb/Core/Progress.h +++ b/lldb/include/lldb/Core/Progress.h @@ -42,8 +42,8 @@ namespace lldb_private { /// uint64_t total, /// void *baton); /// -/// This callback will always initially be called with "completed" set to zero -/// and "total" set to the total amount specified in the contructor. This is +/// This callback will always initially be called with \a completed set to zero +/// and \a total set to the total amount specified in the constructor. This is /// considered the progress start event. As Progress::Increment() is called, /// the callback will be called as long as the Progress::m_completed has not /// yet exceeded the Progress::m_total. When the callback is called with @@ -52,7 +52,7 @@ namespace lldb_private { /// Progress::m_total, then this is considered a progress update event. /// /// This callback will be called in the destructor if Progress::m_completed is -/// not equal to Progress::m_total with the "completed" set to +/// not equal to Progress::m_total with the \a completed set to /// Progress::m_total. This ensures we always send a progress completed update /// even if the user does not. @@ -62,7 +62,7 @@ class Progress { /// /// The constructor will create a unique progress reporting object and /// immediately send out a progress update by calling the installed callback - /// with completed set to zero out of the specified total. + /// with \a completed set to zero out of the specified total. /// /// @param [in] title The title of this progress activity. /// @@ -86,11 +86,11 @@ class Progress { /// Destroy the progress object. /// /// If the progress has not yet sent a completion update, the destructor - /// will send out a notification where the completed == m_total. This ensures - /// that we always send out a progress complete notification. + /// will send out a notification where the \a completed == m_total. This + /// ensures that we always send out a progress complete notification. ~Progress(); - /// Increment the progress and send a notification to the intalled callback. + /// Increment the progress and send a notification to the installed callback. /// /// If incrementing ends up exceeding m_total, m_completed will be updated /// to match m_total and no subsequent progress notifications will be sent. diff --git a/lldb/include/lldb/Host/Config.h.cmake b/lldb/include/lldb/Host/Config.h.cmake index 9e538534086a2b..3defa454f6d420 100644 --- a/lldb/include/lldb/Host/Config.h.cmake +++ b/lldb/include/lldb/Host/Config.h.cmake @@ -33,8 +33,6 @@ #cmakedefine01 LLDB_ENABLE_LZMA -#cmakedefine01 LLVM_ENABLE_CURL - #cmakedefine01 LLDB_ENABLE_CURSES #cmakedefine01 CURSES_HAVE_NCURSES_CURSES_H diff --git a/lldb/include/lldb/Symbol/SymbolLocation.h b/lldb/include/lldb/Symbol/SymbolLocation.h new file mode 100644 index 00000000000000..be590c403b6e20 --- /dev/null +++ b/lldb/include/lldb/Symbol/SymbolLocation.h @@ -0,0 +1,32 @@ +//===-- SymbolLocation.h ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SYMBOL_SYMBOLLOCATION_H +#define LLDB_SYMBOL_SYMBOLLOCATION_H + +#include "lldb/Utility/ConstString.h" +#include "lldb/Utility/FileSpec.h" +#include "lldb/lldb-private.h" + +#include + +namespace lldb_private { + +/// Stores a function module spec, symbol name and possibly an alternate symbol +/// name. +struct SymbolLocation { + FileSpec module_spec; + std::vector symbols; + + // The symbols are regular expressions. In such case all symbols are matched + // with their trailing @VER symbol version stripped. + bool symbols_are_regex = false; +}; + +} // namespace lldb_private +#endif // LLDB_SYMBOL_SYMBOLLOCATION_H diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index ceaf547ebddaf9..c8475db8ae1609 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -465,6 +465,8 @@ class Process : public std::enable_shared_from_this, static bool SetUpdateStateOnRemoval(Event *event_ptr); private: + bool ForwardEventToPendingListeners(Event *event_ptr) override; + void SetUpdateStateOnRemoval() { m_update_state++; } void SetRestarted(bool new_value) { m_restarted = new_value; } diff --git a/lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h b/lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h new file mode 100644 index 00000000000000..7e045760a28be6 --- /dev/null +++ b/lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h @@ -0,0 +1,39 @@ +#ifndef LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H +#define LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H + +#include "lldb/Target/StackFrameRecognizer.h" + +namespace lldb_private { + +void RegisterVerboseTrapFrameRecognizer(Process &process); + +/// Holds the stack frame that caused the Verbose trap and the inlined stop +/// reason message. +class VerboseTrapRecognizedStackFrame : public RecognizedStackFrame { +public: + VerboseTrapRecognizedStackFrame(lldb::StackFrameSP most_relevant_frame_sp, + std::string stop_desc); + + lldb::StackFrameSP GetMostRelevantFrame() override; + +private: + lldb::StackFrameSP m_most_relevant_frame; +}; + +/// When a thread stops, it checks the current frame contains a +/// Verbose Trap diagnostic. If so, it returns a \a +/// VerboseTrapRecognizedStackFrame holding the diagnostic a stop reason +/// description with and the parent frame as the most relavant frame. +class VerboseTrapFrameRecognizer : public StackFrameRecognizer { +public: + std::string GetName() override { + return "Verbose Trap StackFrame Recognizer"; + } + + lldb::RecognizedStackFrameSP + RecognizeFrame(lldb::StackFrameSP frame) override; +}; + +} // namespace lldb_private + +#endif // LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H diff --git a/lldb/include/lldb/Utility/Event.h b/lldb/include/lldb/Utility/Event.h index 461d711b8c3f2c..4f58f257d4a261 100644 --- a/lldb/include/lldb/Utility/Event.h +++ b/lldb/include/lldb/Utility/Event.h @@ -48,6 +48,17 @@ class EventData { virtual void Dump(Stream *s) const; private: + /// This will be queried for a Broadcaster with a primary and some secondary + /// listeners after the primary listener pulled the event from the event queue + /// and ran its DoOnRemoval, right before the event is delivered. + /// If it returns true, the event will also be forwarded to the secondary + /// listeners, and if false, event propagation stops at the primary listener. + /// Some broadcasters (particularly the Process broadcaster) fetch events on + /// a private Listener, and then forward the event to the Public Listeners + /// after some processing. The Process broadcaster does not want to forward + /// to the secondary listeners at the private processing stage. + virtual bool ForwardEventToPendingListeners(Event *event_ptr) { return true; } + virtual void DoOnRemoval(Event *event_ptr) {} EventData(const EventData &) = delete; diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py index 0e8ca159efd55d..ecc7b81035f11f 100644 --- a/lldb/packages/Python/lldbsuite/test/decorators.py +++ b/lldb/packages/Python/lldbsuite/test/decorators.py @@ -1053,10 +1053,6 @@ def _get_bool_config_skip_if_decorator(key): return unittest.skipIf(not have, "requires " + key) -def skipIfCurlSupportMissing(func): - return _get_bool_config_skip_if_decorator("curl")(func) - - def skipIfCursesSupportMissing(func): return _get_bool_config_skip_if_decorator("curses")(func) diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index d1a2de8b2478a4..3d562285ce9cc0 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -51,7 +51,7 @@ LLDB_BASE_DIR := $(THIS_FILE_DIR)/../../../../../ # # GNUWin32 uname gives "windows32" or "server version windows32" while # some versions of MSYS uname return "MSYS_NT*", but most environments -# standardize on "Windows_NT", so we'll make it consistent here. +# standardize on "Windows_NT", so we'll make it consistent here. # When running tests from Visual Studio, the environment variable isn't # inherited all the way down to the process spawned for make. #---------------------------------------------------------------------- @@ -213,12 +213,6 @@ else ifeq "$(SPLIT_DEBUG_SYMBOLS)" "YES" DSYM = $(EXE).debug endif - - ifeq "$(MAKE_DWP)" "YES" - MAKE_DWO := YES - DWP_NAME = $(EXE).dwp - DYLIB_DWP_NAME = $(DYLIB_NAME).dwp - endif endif LIMIT_DEBUG_INFO_FLAGS = @@ -367,17 +361,6 @@ ifneq "$(OS)" "Darwin" OBJCOPY ?= $(call replace_cc_with,objcopy) ARCHIVER ?= $(call replace_cc_with,ar) - # Look for llvm-dwp or gnu dwp - DWP ?= $(call replace_cc_with,llvm-dwp) - ifeq ($(wildcard $(DWP)),) - DWP = $(call replace_cc_with,dwp) - ifeq ($(wildcard $(DWP)),) - DWP = $(shell command -v llvm-dwp 2> /dev/null) - ifeq ($(wildcard $(DWP)),) - DWP = $(shell command -v dwp 2> /dev/null) - endif - endif - endif override AR = $(ARCHIVER) endif @@ -548,10 +531,6 @@ ifneq "$(CXX)" "" endif endif -ifeq "$(GEN_GNU_BUILD_ID)" "YES" - LDFLAGS += -Wl,--build-id -endif - #---------------------------------------------------------------------- # DYLIB_ONLY variable can be used to skip the building of a.out. # See the sections below regarding dSYM file as well as the building of @@ -590,18 +569,11 @@ else endif else ifeq "$(SPLIT_DEBUG_SYMBOLS)" "YES" -ifeq "$(SAVE_FULL_DEBUG_BINARY)" "YES" - cp "$(EXE)" "$(EXE).unstripped" -endif $(OBJCOPY) --only-keep-debug "$(EXE)" "$(DSYM)" $(OBJCOPY) --strip-debug --add-gnu-debuglink="$(DSYM)" "$(EXE)" "$(EXE)" endif -ifeq "$(MAKE_DWP)" "YES" - $(DWP) -o "$(DWP_NAME)" $(DWOS) -endif endif - #---------------------------------------------------------------------- # Make the dylib #---------------------------------------------------------------------- @@ -642,15 +614,9 @@ endif else $(LD) $(DYLIB_OBJECTS) $(LDFLAGS) -shared -o "$(DYLIB_FILENAME)" ifeq "$(SPLIT_DEBUG_SYMBOLS)" "YES" -ifeq "$(SAVE_FULL_DEBUG_BINARY)" "YES" - cp "$(DYLIB_FILENAME)" "$(DYLIB_FILENAME).unstripped" -endif $(OBJCOPY) --only-keep-debug "$(DYLIB_FILENAME)" "$(DYLIB_FILENAME).debug" $(OBJCOPY) --strip-debug --add-gnu-debuglink="$(DYLIB_FILENAME).debug" "$(DYLIB_FILENAME)" "$(DYLIB_FILENAME)" endif -ifeq "$(MAKE_DWP)" "YES" - $(DWP) -o $(DYLIB_DWP_FILE) $(DYLIB_DWOS) -endif endif #---------------------------------------------------------------------- diff --git a/lldb/packages/Python/lldbsuite/test/make/libcxx-simulators-common/compressed_pair.h b/lldb/packages/Python/lldbsuite/test/make/libcxx-simulators-common/compressed_pair.h new file mode 100644 index 00000000000000..026e7183ab27a0 --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/make/libcxx-simulators-common/compressed_pair.h @@ -0,0 +1,58 @@ +#ifndef STD_LLDB_COMPRESSED_PAIR_H +#define STD_LLDB_COMPRESSED_PAIR_H + +#include +#include // for std::forward + +namespace std { +namespace __lldb { + +// Post-c88580c layout +struct __value_init_tag {}; +struct __default_init_tag {}; + +template ::value && !std::is_final<_Tp>::value> +struct __compressed_pair_elem { + explicit __compressed_pair_elem(__default_init_tag) {} + explicit __compressed_pair_elem(__value_init_tag) : __value_() {} + + explicit __compressed_pair_elem(_Tp __t) : __value_(__t) {} + + _Tp &__get() { return __value_; } + +private: + _Tp __value_; +}; + +template +struct __compressed_pair_elem<_Tp, _Idx, true> : private _Tp { + explicit __compressed_pair_elem(_Tp __t) : _Tp(__t) {} + explicit __compressed_pair_elem(__default_init_tag) {} + explicit __compressed_pair_elem(__value_init_tag) : _Tp() {} + + _Tp &__get() { return *this; } +}; + +template +class __compressed_pair : private __compressed_pair_elem<_T1, 0>, + private __compressed_pair_elem<_T2, 1> { +public: + using _Base1 = __compressed_pair_elem<_T1, 0>; + using _Base2 = __compressed_pair_elem<_T2, 1>; + + explicit __compressed_pair(_T1 __t1, _T2 __t2) : _Base1(__t1), _Base2(__t2) {} + explicit __compressed_pair() + : _Base1(__value_init_tag()), _Base2(__value_init_tag()) {} + + template + explicit __compressed_pair(_U1 &&__t1, _U2 &&__t2) + : _Base1(std::forward<_U1>(__t1)), _Base2(std::forward<_U2>(__t2)) {} + + _T1 &first() { return static_cast<_Base1 &>(*this).__get(); } +}; +} // namespace __lldb +} // namespace std + +#endif // _H diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp index fb035a36e7d745..29da7d33dd80b8 100644 --- a/lldb/source/API/SBDebugger.cpp +++ b/lldb/source/API/SBDebugger.cpp @@ -775,9 +775,6 @@ SBStructuredData SBDebugger::GetBuildConfiguration() { AddBoolConfigEntry( *config_up, "xml", XMLDocument::XMLEnabled(), "A boolean value that indicates if XML support is enabled in LLDB"); - AddBoolConfigEntry( - *config_up, "curl", LLVM_ENABLE_CURL, - "A boolean value that indicates if CURL support is enabled in LLDB"); AddBoolConfigEntry( *config_up, "curses", LLDB_ENABLE_CURSES, "A boolean value that indicates if curses support is enabled in LLDB"); @@ -1727,20 +1724,20 @@ SBDebugger::LoadTraceFromFile(SBError &error, void SBDebugger::RequestInterrupt() { LLDB_INSTRUMENT_VA(this); - + if (m_opaque_sp) - m_opaque_sp->RequestInterrupt(); + m_opaque_sp->RequestInterrupt(); } void SBDebugger::CancelInterruptRequest() { LLDB_INSTRUMENT_VA(this); - + if (m_opaque_sp) - m_opaque_sp->CancelInterruptRequest(); + m_opaque_sp->CancelInterruptRequest(); } bool SBDebugger::InterruptRequested() { LLDB_INSTRUMENT_VA(this); - + if (m_opaque_sp) return m_opaque_sp->InterruptRequested(); return false; diff --git a/lldb/source/API/SBValue.cpp b/lldb/source/API/SBValue.cpp index 10a691c4034195..96670481eca3fc 100644 --- a/lldb/source/API/SBValue.cpp +++ b/lldb/source/API/SBValue.cpp @@ -764,6 +764,21 @@ lldb::SBValue SBValue::GetNonSyntheticValue() { return value_sb; } +lldb::SBValue SBValue::GetSyntheticValue() { + LLDB_INSTRUMENT_VA(this); + + SBValue value_sb; + if (IsValid()) { + ValueImplSP proxy_sp(new ValueImpl(m_opaque_sp->GetRootSP(), + m_opaque_sp->GetUseDynamic(), true)); + value_sb.SetSP(proxy_sp); + if (!value_sb.IsSynthetic()) { + return {}; + } + } + return value_sb; +} + lldb::DynamicValueType SBValue::GetPreferDynamicValue() { LLDB_INSTRUMENT_VA(this); diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp index 3587a8f529e4ab..8685d5761557ba 100644 --- a/lldb/source/Commands/CommandObjectProcess.cpp +++ b/lldb/source/Commands/CommandObjectProcess.cpp @@ -950,11 +950,13 @@ class CommandObjectProcessLoad : public CommandObjectParsed { ExecutionContext *execution_context) override { Status error; const int short_option = m_getopt_table[option_idx].val; + ArchSpec arch = + execution_context->GetProcessPtr()->GetSystemArchitecture(); switch (short_option) { case 'i': do_install = true; if (!option_arg.empty()) - install_path.SetFile(option_arg, FileSpec::Style::native); + install_path.SetFile(option_arg, arch.GetTriple()); break; default: llvm_unreachable("Unimplemented option"); diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index 80181a9b3cb716..d594330934ad7b 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -4252,7 +4252,7 @@ class CommandObjectTargetSymbolsAdd : public CommandObjectParsed { m_option_group.Append(&m_current_stack_option, LLDB_OPT_SET_2, LLDB_OPT_SET_2); m_option_group.Finalize(); - AddSimpleArgumentList(eArgTypeShlibName); + AddSimpleArgumentList(eArgTypeFilename); } ~CommandObjectTargetSymbolsAdd() override = default; diff --git a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp index 6c46618b337c23..e8014b1eeb3789 100644 --- a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp +++ b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp @@ -622,11 +622,26 @@ std::optional EmulateInstructionRISCV::Decode(uint32_t inst) { Log *log = GetLog(LLDBLog::Unwind); uint16_t try_rvc = uint16_t(inst & 0x0000ffff); - // check whether the compressed encode could be valid - uint16_t mask = try_rvc & 0b11; - bool is_rvc = try_rvc != 0 && mask != 3; uint8_t inst_type = RV64; + // Try to get size of RISCV instruction. + // 1.2 Instruction Length Encoding + bool is_16b = (inst & 0b11) != 0b11; + bool is_32b = (inst & 0x1f) != 0x1f; + bool is_48b = (inst & 0x3f) != 0x1f; + bool is_64b = (inst & 0x7f) != 0x3f; + if (is_16b) + m_last_size = 2; + else if (is_32b) + m_last_size = 4; + else if (is_48b) + m_last_size = 6; + else if (is_64b) + m_last_size = 8; + else + // Not Valid + m_last_size = std::nullopt; + // if we have ArchSpec::eCore_riscv128 in the future, // we also need to check it here if (m_arch.GetCore() == ArchSpec::eCore_riscv32) @@ -638,8 +653,8 @@ std::optional EmulateInstructionRISCV::Decode(uint32_t inst) { LLDB_LOGF( log, "EmulateInstructionRISCV::%s: inst(%x at %" PRIx64 ") was decoded to %s", __FUNCTION__, inst, m_addr, pat.name); - auto decoded = is_rvc ? pat.decode(try_rvc) : pat.decode(inst); - return DecodeResult{decoded, inst, is_rvc, pat}; + auto decoded = is_16b ? pat.decode(try_rvc) : pat.decode(inst); + return DecodeResult{decoded, inst, is_16b, pat}; } } LLDB_LOGF(log, "EmulateInstructionRISCV::%s: inst(0x%x) was unsupported", diff --git a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.h b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.h index 8bca73a7f589df..53ac11c2e1102a 100644 --- a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.h +++ b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.h @@ -60,6 +60,7 @@ class EmulateInstructionRISCV : public EmulateInstruction { bool SetTargetTriple(const ArchSpec &arch) override; bool ReadInstruction() override; + std::optional GetLastInstrSize() override { return m_last_size; } bool EvaluateInstruction(uint32_t options) override; bool TestEmulation(Stream &out_stream, ArchSpec &arch, OptionValueDictionary *test_data) override; @@ -99,6 +100,8 @@ class EmulateInstructionRISCV : public EmulateInstruction { private: /// Last decoded instruction from m_opcode DecodeResult m_decoded; + /// Last decoded instruction size estimate. + std::optional m_last_size; }; } // namespace lldb_private diff --git a/lldb/source/Plugins/Process/Utility/NativeProcessSoftwareSingleStep.cpp b/lldb/source/Plugins/Process/Utility/NativeProcessSoftwareSingleStep.cpp index 6bf8a0dc28b22e..ef71a964eaf206 100644 --- a/lldb/source/Plugins/Process/Utility/NativeProcessSoftwareSingleStep.cpp +++ b/lldb/source/Plugins/Process/Utility/NativeProcessSoftwareSingleStep.cpp @@ -94,6 +94,38 @@ static lldb::addr_t ReadFlags(NativeRegisterContext ®siter_context) { LLDB_INVALID_ADDRESS); } +static int GetSoftwareBreakpointSize(const ArchSpec &arch, + lldb::addr_t next_flags) { + if (arch.GetMachine() == llvm::Triple::arm) { + if (next_flags & 0x20) + // Thumb mode + return 2; + // Arm mode + return 4; + } + if (arch.IsMIPS() || arch.GetTriple().isPPC64() || + arch.GetTriple().isRISCV() || arch.GetTriple().isLoongArch()) + return 4; + return 0; +} + +static Status SetSoftwareBreakpointOnPC(const ArchSpec &arch, lldb::addr_t pc, + lldb::addr_t next_flags, + NativeProcessProtocol &process) { + int size_hint = GetSoftwareBreakpointSize(arch, next_flags); + Status error; + error = process.SetBreakpoint(pc, size_hint, /*hardware=*/false); + + // If setting the breakpoint fails because pc is out of the address + // space, ignore it and let the debugee segfault. + if (error.GetError() == EIO || error.GetError() == EFAULT) + return Status(); + if (error.Fail()) + return error; + + return Status(); +} + Status NativeProcessSoftwareSingleStep::SetupSoftwareSingleStepping( NativeThreadProtocol &thread) { Status error; @@ -115,8 +147,23 @@ Status NativeProcessSoftwareSingleStep::SetupSoftwareSingleStepping( emulator_up->SetWriteMemCallback(&WriteMemoryCallback); emulator_up->SetWriteRegCallback(&WriteRegisterCallback); - if (!emulator_up->ReadInstruction()) - return Status("Read instruction failed!"); + if (!emulator_up->ReadInstruction()) { + // try to get at least the size of next instruction to set breakpoint. + auto instr_size = emulator_up->GetLastInstrSize(); + if (!instr_size) + return Status("Read instruction failed!"); + bool success = false; + auto pc = emulator_up->ReadRegisterUnsigned(eRegisterKindGeneric, + LLDB_REGNUM_GENERIC_PC, + LLDB_INVALID_ADDRESS, &success); + if (!success) + return Status("Reading pc failed!"); + lldb::addr_t next_pc = pc + *instr_size; + auto result = + SetSoftwareBreakpointOnPC(arch, next_pc, /* next_flags */ 0x0, process); + m_threads_stepping_with_breakpoint.insert({thread.GetID(), next_pc}); + return result; + } bool emulation_result = emulator_up->EvaluateInstruction(eEmulateInstructionOptionAutoAdvancePC); @@ -157,29 +204,7 @@ Status NativeProcessSoftwareSingleStep::SetupSoftwareSingleStepping( // modifying the PC but we don't know how. return Status("Instruction emulation failed unexpectedly."); } - - int size_hint = 0; - if (arch.GetMachine() == llvm::Triple::arm) { - if (next_flags & 0x20) { - // Thumb mode - size_hint = 2; - } else { - // Arm mode - size_hint = 4; - } - } else if (arch.IsMIPS() || arch.GetTriple().isPPC64() || - arch.GetTriple().isRISCV() || arch.GetTriple().isLoongArch()) - size_hint = 4; - error = process.SetBreakpoint(next_pc, size_hint, /*hardware=*/false); - - // If setting the breakpoint fails because next_pc is out of the address - // space, ignore it and let the debugee segfault. - if (error.GetError() == EIO || error.GetError() == EFAULT) { - return Status(); - } else if (error.Fail()) - return error; - + auto result = SetSoftwareBreakpointOnPC(arch, next_pc, next_flags, process); m_threads_stepping_with_breakpoint.insert({thread.GetID(), next_pc}); - - return Status(); + return result; } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index 8e297141f4e132..85c59a605c675c 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -23,7 +23,6 @@ #include "Plugins/ExpressionParser/Clang/ClangUtil.h" #include "Plugins/Language/ObjC/ObjCLanguage.h" #include "lldb/Core/Module.h" -#include "lldb/Core/Progress.h" #include "lldb/Core/Value.h" #include "lldb/Host/Host.h" #include "lldb/Symbol/CompileUnit.h" @@ -824,6 +823,34 @@ DWARFASTParserClang::GetDIEClassTemplateParams(const DWARFDIE &die) { return {}; } +void DWARFASTParserClang::MapDeclDIEToDefDIE( + const lldb_private::plugin::dwarf::DWARFDIE &decl_die, + const lldb_private::plugin::dwarf::DWARFDIE &def_die) { + LinkDeclContextToDIE(GetCachedClangDeclContextForDIE(decl_die), def_die); + SymbolFileDWARF *dwarf = def_die.GetDWARF(); + ParsedDWARFTypeAttributes decl_attrs(decl_die); + ParsedDWARFTypeAttributes def_attrs(def_die); + ConstString unique_typename(decl_attrs.name); + Declaration decl_declaration(decl_attrs.decl); + GetUniqueTypeNameAndDeclaration( + decl_die, SymbolFileDWARF::GetLanguage(*decl_die.GetCU()), + unique_typename, decl_declaration); + if (UniqueDWARFASTType *unique_ast_entry_type = + dwarf->GetUniqueDWARFASTTypeMap().Find( + unique_typename, decl_die, decl_declaration, + decl_attrs.byte_size.value_or(0), + decl_attrs.is_forward_declaration)) { + unique_ast_entry_type->UpdateToDefDIE(def_die, def_attrs.decl, + def_attrs.byte_size.value_or(0)); + } else if (Log *log = GetLog(DWARFLog::TypeCompletion | DWARFLog::Lookups)) { + const dw_tag_t tag = decl_die.Tag(); + LLDB_LOG(log, + "Failed to find {0:x16} {1} ({2}) type \"{3}\" in " + "UniqueDWARFASTTypeMap", + decl_die.GetID(), DW_TAG_value_to_name(tag), tag, unique_typename); + } +} + TypeSP DWARFASTParserClang::ParseEnum(const SymbolContext &sc, const DWARFDIE &decl_die, ParsedDWARFTypeAttributes &attrs) { @@ -1546,13 +1573,17 @@ TypeSP DWARFASTParserClang::UpdateSymbolContextScopeForType( return type_sp; } -std::string -DWARFASTParserClang::GetCPlusPlusQualifiedName(const DWARFDIE &die) { - if (!die.IsValid()) - return ""; - const char *name = die.GetName(); - if (!name) - return ""; +void DWARFASTParserClang::GetUniqueTypeNameAndDeclaration( + const lldb_private::plugin::dwarf::DWARFDIE &die, + lldb::LanguageType language, lldb_private::ConstString &unique_typename, + lldb_private::Declaration &decl_declaration) { + // For C++, we rely solely upon the one definition rule that says + // only one thing can exist at a given decl context. We ignore the + // file and line that things are declared on. + if (!die.IsValid() || !Language::LanguageIsCPlusPlus(language) || + unique_typename.IsEmpty()) + return; + decl_declaration.Clear(); std::string qualified_name; DWARFDIE parent_decl_ctx_die = die.GetParentDeclContextDIE(); // TODO: change this to get the correct decl context parent.... @@ -1595,49 +1626,65 @@ DWARFASTParserClang::GetCPlusPlusQualifiedName(const DWARFDIE &die) { if (qualified_name.empty()) qualified_name.append("::"); - qualified_name.append(name); + qualified_name.append(unique_typename.GetCString()); qualified_name.append(GetDIEClassTemplateParams(die)); - return qualified_name; + unique_typename = ConstString(qualified_name); } TypeSP DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, - const DWARFDIE &decl_die, + const DWARFDIE &die, ParsedDWARFTypeAttributes &attrs) { CompilerType clang_type; - const dw_tag_t tag = decl_die.Tag(); - SymbolFileDWARF *dwarf = decl_die.GetDWARF(); - LanguageType cu_language = SymbolFileDWARF::GetLanguage(*decl_die.GetCU()); + const dw_tag_t tag = die.Tag(); + SymbolFileDWARF *dwarf = die.GetDWARF(); + LanguageType cu_language = SymbolFileDWARF::GetLanguage(*die.GetCU()); Log *log = GetLog(DWARFLog::TypeCompletion | DWARFLog::Lookups); - // UniqueDWARFASTType is large, so don't create a local variables on the - // stack, put it on the heap. This function is often called recursively and - // clang isn't good at sharing the stack space for variables in different - // blocks. - auto unique_ast_entry_up = std::make_unique(); - ConstString unique_typename(attrs.name); Declaration unique_decl(attrs.decl); + uint64_t byte_size = attrs.byte_size.value_or(0); + if (attrs.byte_size && *attrs.byte_size == 0 && attrs.name && + !die.HasChildren() && cu_language == eLanguageTypeObjC) { + // Work around an issue with clang at the moment where forward + // declarations for objective C classes are emitted as: + // DW_TAG_structure_type [2] + // DW_AT_name( "ForwardObjcClass" ) + // DW_AT_byte_size( 0x00 ) + // DW_AT_decl_file( "..." ) + // DW_AT_decl_line( 1 ) + // + // Note that there is no DW_AT_declaration and there are no children, + // and the byte size is zero. + attrs.is_forward_declaration = true; + } if (attrs.name) { - if (Language::LanguageIsCPlusPlus(cu_language)) { - // For C++, we rely solely upon the one definition rule that says - // only one thing can exist at a given decl context. We ignore the - // file and line that things are declared on. - std::string qualified_name = GetCPlusPlusQualifiedName(decl_die); - if (!qualified_name.empty()) - unique_typename = ConstString(qualified_name); - unique_decl.Clear(); - } - - if (dwarf->GetUniqueDWARFASTTypeMap().Find( - unique_typename, decl_die, unique_decl, - attrs.byte_size.value_or(-1), *unique_ast_entry_up)) { - if (TypeSP type_sp = unique_ast_entry_up->m_type_sp) { + GetUniqueTypeNameAndDeclaration(die, cu_language, unique_typename, + unique_decl); + if (UniqueDWARFASTType *unique_ast_entry_type = + dwarf->GetUniqueDWARFASTTypeMap().Find( + unique_typename, die, unique_decl, byte_size, + attrs.is_forward_declaration)) { + if (TypeSP type_sp = unique_ast_entry_type->m_type_sp) { + dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get(); LinkDeclContextToDIE( - GetCachedClangDeclContextForDIE(unique_ast_entry_up->m_die), - decl_die); + GetCachedClangDeclContextForDIE(unique_ast_entry_type->m_die), die); + // If the DIE being parsed in this function is a definition and the + // entry in the map is a declaration, then we need to update the entry + // to point to the definition DIE. + if (!attrs.is_forward_declaration && + unique_ast_entry_type->m_is_forward_declaration) { + unique_ast_entry_type->UpdateToDefDIE(die, unique_decl, byte_size); + clang_type = type_sp->GetForwardCompilerType(); + + CompilerType compiler_type_no_qualifiers = + ClangUtil::RemoveFastQualifiers(clang_type); + dwarf->GetForwardDeclCompilerTypeToDIE().insert_or_assign( + compiler_type_no_qualifiers.GetOpaqueQualType(), + *die.GetDIERef()); + } return type_sp; } } @@ -1659,128 +1706,56 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, default_accessibility = eAccessPrivate; } - if (attrs.byte_size && *attrs.byte_size == 0 && attrs.name && - !decl_die.HasChildren() && cu_language == eLanguageTypeObjC) { - // Work around an issue with clang at the moment where forward - // declarations for objective C classes are emitted as: - // DW_TAG_structure_type [2] - // DW_AT_name( "ForwardObjcClass" ) - // DW_AT_byte_size( 0x00 ) - // DW_AT_decl_file( "..." ) - // DW_AT_decl_line( 1 ) - // - // Note that there is no DW_AT_declaration and there are no children, - // and the byte size is zero. - attrs.is_forward_declaration = true; - } + if ((attrs.class_language == eLanguageTypeObjC || + attrs.class_language == eLanguageTypeObjC_plus_plus) && + !attrs.is_complete_objc_class && + die.Supports_DW_AT_APPLE_objc_complete_type()) { + // We have a valid eSymbolTypeObjCClass class symbol whose name + // matches the current objective C class that we are trying to find + // and this DIE isn't the complete definition (we checked + // is_complete_objc_class above and know it is false), so the real + // definition is in here somewhere + TypeSP type_sp = + dwarf->FindCompleteObjCDefinitionTypeForDIE(die, attrs.name, true); - if (attrs.class_language == eLanguageTypeObjC || - attrs.class_language == eLanguageTypeObjC_plus_plus) { - if (!attrs.is_complete_objc_class && - decl_die.Supports_DW_AT_APPLE_objc_complete_type()) { - // We have a valid eSymbolTypeObjCClass class symbol whose name - // matches the current objective C class that we are trying to find - // and this DIE isn't the complete definition (we checked - // is_complete_objc_class above and know it is false), so the real - // definition is in here somewhere - TypeSP type_sp = - dwarf->FindCompleteObjCDefinitionTypeForDIE(decl_die, attrs.name, true); - - if (!type_sp) { - SymbolFileDWARFDebugMap *debug_map_symfile = - dwarf->GetDebugMapSymfile(); - if (debug_map_symfile) { - // We weren't able to find a full declaration in this DWARF, - // see if we have a declaration anywhere else... - type_sp = debug_map_symfile->FindCompleteObjCDefinitionTypeForDIE( - decl_die, attrs.name, true); - } + if (!type_sp) { + SymbolFileDWARFDebugMap *debug_map_symfile = dwarf->GetDebugMapSymfile(); + if (debug_map_symfile) { + // We weren't able to find a full declaration in this DWARF, + // see if we have a declaration anywhere else... + type_sp = debug_map_symfile->FindCompleteObjCDefinitionTypeForDIE( + die, attrs.name, true); } + } - if (type_sp) { - if (log) { - dwarf->GetObjectFile()->GetModule()->LogMessage( - log, - "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is an " - "incomplete objc type, complete type is {5:x8}", - static_cast(this), decl_die.GetOffset(), - DW_TAG_value_to_name(tag), tag, attrs.name.GetCString(), - type_sp->GetID()); - } - return type_sp; + if (type_sp) { + if (log) { + dwarf->GetObjectFile()->GetModule()->LogMessage( + log, + "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is an " + "incomplete objc type, complete type is {5:x8}", + static_cast(this), die.GetID(), DW_TAG_value_to_name(tag), + tag, attrs.name.GetCString(), type_sp->GetID()); } + return type_sp; } } - DWARFDIE def_die; if (attrs.is_forward_declaration) { - Progress progress(llvm::formatv( - "Parsing type in {0}: '{1}'", - dwarf->GetObjectFile()->GetFileSpec().GetFilename().GetString(), - attrs.name.GetString())); - - // We have a forward declaration to a type and we need to try and - // find a full declaration. We look in the current type index just in - // case we have a forward declaration followed by an actual - // declarations in the DWARF. If this fails, we need to look - // elsewhere... - if (log) { - dwarf->GetObjectFile()->GetModule()->LogMessage( - log, - "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is a " - "forward declaration, trying to find complete type", - static_cast(this), decl_die.GetID(), - DW_TAG_value_to_name(tag), tag, attrs.name.GetCString()); - } - // See if the type comes from a Clang module and if so, track down // that type. - if (TypeSP type_sp = ParseTypeFromClangModule(sc, decl_die, log)) + TypeSP type_sp = ParseTypeFromClangModule(sc, die, log); + if (type_sp) return type_sp; - - def_die = dwarf->FindDefinitionDIE(decl_die); - - if (!def_die) { - SymbolFileDWARFDebugMap *debug_map_symfile = dwarf->GetDebugMapSymfile(); - if (debug_map_symfile) { - // We weren't able to find a full declaration in this DWARF, see - // if we have a declaration anywhere else... - def_die = debug_map_symfile->FindDefinitionDIE(decl_die); - } - } - - if (log) { - dwarf->GetObjectFile()->GetModule()->LogMessage( - log, - "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is a " - "forward declaration, complete type is {5}", - static_cast(this), def_die.GetID(), DW_TAG_value_to_name(tag), - tag, attrs.name.GetCString(), - def_die ? llvm::utohexstr(def_die.GetID()) : "not found"); - } } - if (def_die) { - if (auto [it, inserted] = dwarf->GetDIEToType().try_emplace( - def_die.GetDIE(), DIE_IS_BEING_PARSED); - !inserted) { - if (it->getSecond() == nullptr || it->getSecond() == DIE_IS_BEING_PARSED) - return nullptr; - return it->getSecond()->shared_from_this(); - } - attrs = ParsedDWARFTypeAttributes(def_die); - } else { - // No definition found. Proceed with the declaration die. We can use it to - // create a forward-declared type. - def_die = decl_die; - } assert(tag_decl_kind != -1); UNUSED_IF_ASSERT_DISABLED(tag_decl_kind); - bool clang_type_was_created = false; - clang::DeclContext *containing_decl_ctx = GetClangDeclContextContainingDIE(def_die, nullptr); + clang::DeclContext *containing_decl_ctx = + GetClangDeclContextContainingDIE(die, nullptr); PrepareContextToReceiveMembers(m_ast, GetClangASTImporter(), - containing_decl_ctx, def_die, + containing_decl_ctx, die, attrs.name.GetCString()); if (attrs.accessibility == eAccessNone && containing_decl_ctx) { @@ -1793,50 +1768,47 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, } ClangASTMetadata metadata; - metadata.SetUserID(def_die.GetID()); - metadata.SetIsDynamicCXXType(dwarf->ClassOrStructIsVirtual(def_die)); + metadata.SetUserID(die.GetID()); + metadata.SetIsDynamicCXXType(dwarf->ClassOrStructIsVirtual(die)); TypeSystemClang::TemplateParameterInfos template_param_infos; - if (ParseTemplateParameterInfos(def_die, template_param_infos)) { + if (ParseTemplateParameterInfos(die, template_param_infos)) { clang::ClassTemplateDecl *class_template_decl = m_ast.ParseClassTemplateDecl( - containing_decl_ctx, GetOwningClangModule(def_die), - attrs.accessibility, attrs.name.GetCString(), tag_decl_kind, - template_param_infos); + containing_decl_ctx, GetOwningClangModule(die), attrs.accessibility, + attrs.name.GetCString(), tag_decl_kind, template_param_infos); if (!class_template_decl) { if (log) { dwarf->GetObjectFile()->GetModule()->LogMessage( log, "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" " "clang::ClassTemplateDecl failed to return a decl.", - static_cast(this), def_die.GetID(), - DW_TAG_value_to_name(tag), tag, attrs.name.GetCString()); + static_cast(this), die.GetID(), DW_TAG_value_to_name(tag), + tag, attrs.name.GetCString()); } return TypeSP(); } clang::ClassTemplateSpecializationDecl *class_specialization_decl = m_ast.CreateClassTemplateSpecializationDecl( - containing_decl_ctx, GetOwningClangModule(def_die), - class_template_decl, tag_decl_kind, template_param_infos); + containing_decl_ctx, GetOwningClangModule(die), class_template_decl, + tag_decl_kind, template_param_infos); clang_type = m_ast.CreateClassTemplateSpecializationType(class_specialization_decl); - clang_type_was_created = true; m_ast.SetMetadata(class_template_decl, metadata); m_ast.SetMetadata(class_specialization_decl, metadata); } - if (!clang_type_was_created) { - clang_type_was_created = true; + if (!clang_type) { clang_type = m_ast.CreateRecordType( - containing_decl_ctx, GetOwningClangModule(def_die), attrs.accessibility, + containing_decl_ctx, GetOwningClangModule(die), attrs.accessibility, attrs.name.GetCString(), tag_decl_kind, attrs.class_language, &metadata, attrs.exports_symbols); } TypeSP type_sp = dwarf->MakeType( - def_die.GetID(), attrs.name, attrs.byte_size, nullptr, LLDB_INVALID_UID, + die.GetID(), attrs.name, attrs.byte_size, nullptr, LLDB_INVALID_UID, Type::eEncodingIsUID, &attrs.decl, clang_type, Type::ResolveState::Forward, TypePayloadClang(OptionalClangModuleID(), attrs.is_complete_objc_class)); @@ -1846,39 +1818,38 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, // function prototypes. clang::DeclContext *type_decl_ctx = TypeSystemClang::GetDeclContextForType(clang_type); - LinkDeclContextToDIE(type_decl_ctx, decl_die); - if (decl_die != def_die) { - LinkDeclContextToDIE(type_decl_ctx, def_die); - dwarf->GetDIEToType()[def_die.GetDIE()] = type_sp.get(); - // Declaration DIE is inserted into the type map in ParseTypeFromDWARF - } + LinkDeclContextToDIE(type_decl_ctx, die); + // UniqueDWARFASTType is large, so don't create a local variables on the + // stack, put it on the heap. This function is often called recursively and + // clang isn't good at sharing the stack space for variables in different + // blocks. + auto unique_ast_entry_up = std::make_unique(); // Add our type to the unique type map so we don't end up creating many // copies of the same type over and over in the ASTContext for our // module unique_ast_entry_up->m_type_sp = type_sp; - unique_ast_entry_up->m_die = def_die; + unique_ast_entry_up->m_die = die; unique_ast_entry_up->m_declaration = unique_decl; - unique_ast_entry_up->m_byte_size = attrs.byte_size.value_or(0); + unique_ast_entry_up->m_byte_size = byte_size; + unique_ast_entry_up->m_is_forward_declaration = attrs.is_forward_declaration; dwarf->GetUniqueDWARFASTTypeMap().Insert(unique_typename, *unique_ast_entry_up); - if (clang_type_was_created) { - // Leave this as a forward declaration until we need to know the - // details of the type. lldb_private::Type will automatically call - // the SymbolFile virtual function - // "SymbolFileDWARF::CompleteType(Type *)" When the definition - // needs to be defined. - bool inserted = - dwarf->GetForwardDeclCompilerTypeToDIE() - .try_emplace( - ClangUtil::RemoveFastQualifiers(clang_type).GetOpaqueQualType(), - *def_die.GetDIERef()) - .second; - assert(inserted && "Type already in the forward declaration map!"); - (void)inserted; - m_ast.SetHasExternalStorage(clang_type.GetOpaqueQualType(), true); - } + // Leave this as a forward declaration until we need to know the + // details of the type. lldb_private::Type will automatically call + // the SymbolFile virtual function + // "SymbolFileDWARF::CompleteType(Type *)" When the definition + // needs to be defined. + bool inserted = + dwarf->GetForwardDeclCompilerTypeToDIE() + .try_emplace( + ClangUtil::RemoveFastQualifiers(clang_type).GetOpaqueQualType(), + *die.GetDIERef()) + .second; + assert(inserted && "Type already in the forward declaration map!"); + (void)inserted; + m_ast.SetHasExternalStorage(clang_type.GetOpaqueQualType(), true); // If we made a clang type, set the trivial abi if applicable: We only // do this for pass by value - which implies the Trivial ABI. There diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h index 7b5ddbaa2a6b52..4b0ae026bce7e9 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h @@ -109,6 +109,9 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { std::string GetDIEClassTemplateParams( const lldb_private::plugin::dwarf::DWARFDIE &die) override; + void MapDeclDIEToDefDIE(const lldb_private::plugin::dwarf::DWARFDIE &decl_die, + const lldb_private::plugin::dwarf::DWARFDIE &def_die); + protected: /// Protected typedefs and members. /// @{ @@ -168,8 +171,10 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { lldb_private::TypeSystemClang::TemplateParameterInfos &template_param_infos); - std::string - GetCPlusPlusQualifiedName(const lldb_private::plugin::dwarf::DWARFDIE &die); + void GetUniqueTypeNameAndDeclaration( + const lldb_private::plugin::dwarf::DWARFDIE &die, + lldb::LanguageType language, lldb_private::ConstString &unique_typename, + lldb_private::Declaration &decl_declaration); bool ParseChildMembers( const lldb_private::plugin::dwarf::DWARFDIE &die, diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index e09c9a86478bb7..7cd3a33c7de575 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -481,6 +481,13 @@ static ConstString GetDWARFMachOSegmentName() { return g_dwarf_section_name; } +llvm::DenseMap & +SymbolFileDWARF::GetForwardDeclCompilerTypeToDIE() { + if (SymbolFileDWARFDebugMap *debug_map_symfile = GetDebugMapSymfile()) + return debug_map_symfile->GetForwardDeclCompilerTypeToDIE(); + return m_forward_decl_compiler_type_to_die; +} + UniqueDWARFASTTypeMap &SymbolFileDWARF::GetUniqueDWARFASTTypeMap() { SymbolFileDWARFDebugMap *debug_map_symfile = GetDebugMapSymfile(); if (debug_map_symfile) @@ -1631,27 +1638,45 @@ bool SymbolFileDWARF::CompleteType(CompilerType &compiler_type) { return true; } - DWARFDIE dwarf_die = GetDIE(die_it->getSecond()); - if (dwarf_die) { - // Once we start resolving this type, remove it from the forward - // declaration map in case anyone child members or other types require this - // type to get resolved. The type will get resolved when all of the calls - // to SymbolFileDWARF::ResolveClangOpaqueTypeDefinition are done. - GetForwardDeclCompilerTypeToDIE().erase(die_it); - - Type *type = GetDIEToType().lookup(dwarf_die.GetDIE()); + DWARFDIE decl_die = GetDIE(die_it->getSecond()); + // Once we start resolving this type, remove it from the forward + // declaration map in case anyone's child members or other types require this + // type to get resolved. + GetForwardDeclCompilerTypeToDIE().erase(die_it); + DWARFDIE def_die = FindDefinitionDIE(decl_die); + if (!def_die) { + SymbolFileDWARFDebugMap *debug_map_symfile = GetDebugMapSymfile(); + if (debug_map_symfile) { + // We weren't able to find a full declaration in this DWARF, see + // if we have a declaration anywhere else... + def_die = debug_map_symfile->FindDefinitionDIE(decl_die); + } + } + if (!def_die) { + // If we don't have definition DIE, CompleteTypeFromDWARF will forcefully + // complete this type. + def_die = decl_die; + } - Log *log = GetLog(DWARFLog::DebugInfo | DWARFLog::TypeCompletion); - if (log) - GetObjectFile()->GetModule()->LogMessageVerboseBacktrace( - log, "{0:x8}: {1} ({2}) '{3}' resolving forward declaration...", - dwarf_die.GetID(), DW_TAG_value_to_name(dwarf_die.Tag()), - dwarf_die.Tag(), type->GetName().AsCString()); - assert(compiler_type); - if (DWARFASTParser *dwarf_ast = GetDWARFParser(*dwarf_die.GetCU())) - return dwarf_ast->CompleteTypeFromDWARF(dwarf_die, type, compiler_type); + DWARFASTParser *dwarf_ast = GetDWARFParser(*def_die.GetCU()); + if (!dwarf_ast) + return false; + Type *type = GetDIEToType().lookup(decl_die.GetDIE()); + if (decl_die != def_die) { + GetDIEToType()[def_die.GetDIE()] = type; + DWARFASTParserClang *ast_parser = + static_cast(dwarf_ast); + ast_parser->MapDeclDIEToDefDIE(decl_die, def_die); } - return false; + + Log *log = GetLog(DWARFLog::DebugInfo | DWARFLog::TypeCompletion); + if (log) + GetObjectFile()->GetModule()->LogMessageVerboseBacktrace( + log, "{0:x8}: {1} ({2}) '{3}' resolving forward declaration...", + def_die.GetID(), DW_TAG_value_to_name(def_die.Tag()), def_die.Tag(), + type->GetName().AsCString()); + assert(compiler_type); + return dwarf_ast->CompleteTypeFromDWARF(def_die, type, compiler_type); } Type *SymbolFileDWARF::ResolveType(const DWARFDIE &die, @@ -3047,8 +3072,15 @@ TypeSP SymbolFileDWARF::FindCompleteObjCDefinitionTypeForDIE( DWARFDIE SymbolFileDWARF::FindDefinitionDIE(const DWARFDIE &die) { - if (!die.GetName()) + const char *name = die.GetName(); + if (!name) return {}; + if (!die.GetAttributeValueAsUnsigned(DW_AT_declaration, 0)) + return die; + + Progress progress(llvm::formatv( + "Searching definition DIE in {0}: '{1}'", + GetObjectFile()->GetFileSpec().GetFilename().GetString(), name)); const dw_tag_t tag = die.Tag(); @@ -3058,7 +3090,7 @@ SymbolFileDWARF::FindDefinitionDIE(const DWARFDIE &die) { log, "SymbolFileDWARF::FindDefinitionDIE(tag={0} " "({1}), name='{2}')", - DW_TAG_value_to_name(tag), tag, die.GetName()); + DW_TAG_value_to_name(tag), tag, name); } // Get the type system that we are looking to find a type for. We will @@ -3132,7 +3164,7 @@ SymbolFileDWARF::FindDefinitionDIE(const DWARFDIE &die) { log, "SymbolFileDWARF::FindDefinitionDIE(tag={0} ({1}), " "name='{2}') ignoring die={3:x16} ({4})", - DW_TAG_value_to_name(tag), tag, die.GetName(), type_die.GetOffset(), + DW_TAG_value_to_name(tag), tag, name, type_die.GetOffset(), type_die.GetName()); } return true; @@ -3144,7 +3176,7 @@ SymbolFileDWARF::FindDefinitionDIE(const DWARFDIE &die) { log, "SymbolFileDWARF::FindDefinitionTypeDIE(tag={0} ({1}), name='{2}') " "trying die={3:x16} ({4})", - DW_TAG_value_to_name(tag), tag, die.GetName(), type_die.GetOffset(), + DW_TAG_value_to_name(tag), tag, name, type_die.GetOffset(), type_dwarf_decl_ctx.GetQualifiedName()); } @@ -4299,38 +4331,26 @@ const std::shared_ptr &SymbolFileDWARF::GetDwpSymbolFile() { FileSpecList search_paths = Target::GetDefaultDebugFileSearchPaths(); ModuleSpec module_spec; module_spec.GetFileSpec() = m_objfile_sp->GetFileSpec(); - FileSpec dwp_filespec; for (const auto &symfile : symfiles.files()) { module_spec.GetSymbolFileSpec() = FileSpec(symfile.GetPath() + ".dwp", symfile.GetPathStyle()); LLDB_LOG(log, "Searching for DWP using: \"{0}\"", module_spec.GetSymbolFileSpec()); - dwp_filespec = + FileSpec dwp_filespec = PluginManager::LocateExecutableSymbolFile(module_spec, search_paths); if (FileSystem::Instance().Exists(dwp_filespec)) { - break; - } - } - if (!FileSystem::Instance().Exists(dwp_filespec)) { - LLDB_LOG(log, "No DWP file found locally"); - // Fill in the UUID for the module we're trying to match for, so we can - // find the correct DWP file, as the Debuginfod plugin uses *only* this - // data to correctly match the DWP file with the binary. - module_spec.GetUUID() = m_objfile_sp->GetUUID(); - dwp_filespec = - PluginManager::LocateExecutableSymbolFile(module_spec, search_paths); - } - if (FileSystem::Instance().Exists(dwp_filespec)) { - LLDB_LOG(log, "Found DWP file: \"{0}\"", dwp_filespec); - DataBufferSP dwp_file_data_sp; - lldb::offset_t dwp_file_data_offset = 0; - ObjectFileSP dwp_obj_file = ObjectFile::FindPlugin( - GetObjectFile()->GetModule(), &dwp_filespec, 0, - FileSystem::Instance().GetByteSize(dwp_filespec), dwp_file_data_sp, - dwp_file_data_offset); - if (dwp_obj_file) { - m_dwp_symfile = std::make_shared( - *this, dwp_obj_file, DIERef::k_file_index_mask); + LLDB_LOG(log, "Found DWP file: \"{0}\"", dwp_filespec); + DataBufferSP dwp_file_data_sp; + lldb::offset_t dwp_file_data_offset = 0; + ObjectFileSP dwp_obj_file = ObjectFile::FindPlugin( + GetObjectFile()->GetModule(), &dwp_filespec, 0, + FileSystem::Instance().GetByteSize(dwp_filespec), dwp_file_data_sp, + dwp_file_data_offset); + if (dwp_obj_file) { + m_dwp_symfile = std::make_shared( + *this, dwp_obj_file, DIERef::k_file_index_mask); + break; + } } } if (!m_dwp_symfile) { diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h index 8469248872a44f..4967b37d753a09 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h @@ -342,12 +342,8 @@ class SymbolFileDWARF : public SymbolFileCommon { virtual DIEToTypePtr &GetDIEToType() { return m_die_to_type; } - typedef llvm::DenseMap - CompilerTypeToDIE; - - virtual CompilerTypeToDIE &GetForwardDeclCompilerTypeToDIE() { - return m_forward_decl_compiler_type_to_die; - } + virtual llvm::DenseMap & + GetForwardDeclCompilerTypeToDIE(); typedef llvm::DenseMap DIEToVariableSP; @@ -537,9 +533,14 @@ class SymbolFileDWARF : public SymbolFileCommon { NameToOffsetMap m_function_scope_qualified_name_map; std::unique_ptr m_ranges; UniqueDWARFASTTypeMap m_unique_ast_type_map; + // A map from DIE to lldb_private::Type. For record type, the key might be + // either declaration DIE or definition DIE. DIEToTypePtr m_die_to_type; DIEToVariableSP m_die_to_variable_sp; - CompilerTypeToDIE m_forward_decl_compiler_type_to_die; + // A map from CompilerType to the struct/class/union/enum DIE (might be a + // declaration or a definition) that is used to construct it. + llvm::DenseMap + m_forward_decl_compiler_type_to_die; llvm::DenseMap> m_type_unit_support_files; std::vector m_lldb_cu_to_dwarf_unit; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h index 7d5516b92737b9..34cb52e5b601c4 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h @@ -284,6 +284,11 @@ class SymbolFileDWARFDebugMap : public SymbolFileCommon { lldb::TypeSP FindCompleteObjCDefinitionTypeForDIE( const DWARFDIE &die, ConstString type_name, bool must_be_implementation); + llvm::DenseMap & + GetForwardDeclCompilerTypeToDIE() { + return m_forward_decl_compiler_type_to_die; + } + UniqueDWARFASTTypeMap &GetUniqueDWARFASTTypeMap() { return m_unique_ast_type_map; } @@ -321,6 +326,10 @@ class SymbolFileDWARFDebugMap : public SymbolFileCommon { std::vector m_func_indexes; // Sorted by address std::vector m_glob_indexes; std::map>, OSOInfoSP> m_oso_map; + // A map from CompilerType to the struct/class/union/enum DIE (might be a + // declaration or a definition) that is used to construct it. + llvm::DenseMap + m_forward_decl_compiler_type_to_die; UniqueDWARFASTTypeMap m_unique_ast_type_map; LazyBool m_supports_DW_AT_APPLE_objc_complete_type; DebugMap m_debug_map; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp index 4a8c532a0d2a5e..49632e1d8911cb 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp @@ -110,7 +110,7 @@ SymbolFileDWARF::DIEToVariableSP &SymbolFileDWARFDwo::GetDIEToVariable() { return GetBaseSymbolFile().GetDIEToVariable(); } -SymbolFileDWARF::CompilerTypeToDIE & +llvm::DenseMap & SymbolFileDWARFDwo::GetForwardDeclCompilerTypeToDIE() { return GetBaseSymbolFile().GetForwardDeclCompilerTypeToDIE(); } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h index 3bd0a2d25a5a6a..15c28fefd81f9d 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h @@ -74,7 +74,8 @@ class SymbolFileDWARFDwo : public SymbolFileDWARF { DIEToVariableSP &GetDIEToVariable() override; - CompilerTypeToDIE &GetForwardDeclCompilerTypeToDIE() override; + llvm::DenseMap & + GetForwardDeclCompilerTypeToDIE() override; UniqueDWARFASTTypeMap &GetUniqueDWARFASTTypeMap() override; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp index 223518f0ae8241..3d201e96f92c3c 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp @@ -13,66 +13,75 @@ using namespace lldb_private::dwarf; using namespace lldb_private::plugin::dwarf; -bool UniqueDWARFASTTypeList::Find(const DWARFDIE &die, - const lldb_private::Declaration &decl, - const int32_t byte_size, - UniqueDWARFASTType &entry) const { - for (const UniqueDWARFASTType &udt : m_collection) { - // Make sure the tags match - if (udt.m_die.Tag() == die.Tag()) { - // Validate byte sizes of both types only if both are valid. - if (udt.m_byte_size < 0 || byte_size < 0 || - udt.m_byte_size == byte_size) { - // Make sure the file and line match - if (udt.m_declaration == decl) { - // The type has the same name, and was defined on the same file and - // line. Now verify all of the parent DIEs match. - DWARFDIE parent_arg_die = die.GetParent(); - DWARFDIE parent_pos_die = udt.m_die.GetParent(); - bool match = true; - bool done = false; - while (!done && match && parent_arg_die && parent_pos_die) { - const dw_tag_t parent_arg_tag = parent_arg_die.Tag(); - const dw_tag_t parent_pos_tag = parent_pos_die.Tag(); - if (parent_arg_tag == parent_pos_tag) { - switch (parent_arg_tag) { - case DW_TAG_class_type: - case DW_TAG_structure_type: - case DW_TAG_union_type: - case DW_TAG_namespace: { - const char *parent_arg_die_name = parent_arg_die.GetName(); - if (parent_arg_die_name == - nullptr) // Anonymous (i.e. no-name) struct - { - match = false; - } else { - const char *parent_pos_die_name = parent_pos_die.GetName(); - if (parent_pos_die_name == nullptr || - ((parent_arg_die_name != parent_pos_die_name) && - strcmp(parent_arg_die_name, parent_pos_die_name))) - match = false; - } - } break; +static bool IsStructOrClassTag(llvm::dwarf::Tag Tag) { + return Tag == llvm::dwarf::Tag::DW_TAG_class_type || + Tag == llvm::dwarf::Tag::DW_TAG_structure_type; +} - case DW_TAG_compile_unit: - case DW_TAG_partial_unit: - done = true; - break; - default: - break; - } +UniqueDWARFASTType *UniqueDWARFASTTypeList::Find( + const DWARFDIE &die, const lldb_private::Declaration &decl, + const int32_t byte_size, bool is_forward_declaration) { + for (UniqueDWARFASTType &udt : m_collection) { + // Make sure the tags match + if (udt.m_die.Tag() == die.Tag() || (IsStructOrClassTag(udt.m_die.Tag()) && + IsStructOrClassTag(die.Tag()))) { + // If they are not both definition DIEs or both declaration DIEs, then + // don't check for byte size and declaration location, because declaration + // DIEs usually don't have those info. + bool matching_size_declaration = + udt.m_is_forward_declaration != is_forward_declaration + ? true + : (udt.m_byte_size < 0 || byte_size < 0 || + udt.m_byte_size == byte_size) && + udt.m_declaration == decl; + if (!matching_size_declaration) + continue; + // The type has the same name, and was defined on the same file and + // line. Now verify all of the parent DIEs match. + DWARFDIE parent_arg_die = die.GetParent(); + DWARFDIE parent_pos_die = udt.m_die.GetParent(); + bool match = true; + bool done = false; + while (!done && match && parent_arg_die && parent_pos_die) { + const dw_tag_t parent_arg_tag = parent_arg_die.Tag(); + const dw_tag_t parent_pos_tag = parent_pos_die.Tag(); + if (parent_arg_tag == parent_pos_tag || + (IsStructOrClassTag(parent_arg_tag) && + IsStructOrClassTag(parent_pos_tag))) { + switch (parent_arg_tag) { + case DW_TAG_class_type: + case DW_TAG_structure_type: + case DW_TAG_union_type: + case DW_TAG_namespace: { + const char *parent_arg_die_name = parent_arg_die.GetName(); + if (parent_arg_die_name == nullptr) { + // Anonymous (i.e. no-name) struct + match = false; + } else { + const char *parent_pos_die_name = parent_pos_die.GetName(); + if (parent_pos_die_name == nullptr || + ((parent_arg_die_name != parent_pos_die_name) && + strcmp(parent_arg_die_name, parent_pos_die_name))) + match = false; } - parent_arg_die = parent_arg_die.GetParent(); - parent_pos_die = parent_pos_die.GetParent(); - } + } break; - if (match) { - entry = udt; - return true; + case DW_TAG_compile_unit: + case DW_TAG_partial_unit: + done = true; + break; + default: + break; } } + parent_arg_die = parent_arg_die.GetParent(); + parent_pos_die = parent_pos_die.GetParent(); + } + + if (match) { + return &udt; } } } - return false; + return nullptr; } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h index bf3cbae55e5c7b..9215484fa2ea22 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h @@ -15,6 +15,7 @@ #include "DWARFDIE.h" #include "lldb/Core/Declaration.h" +#include "lldb/Symbol/Type.h" namespace lldb_private::plugin { namespace dwarf { @@ -23,31 +24,34 @@ class UniqueDWARFASTType { // Constructors and Destructors UniqueDWARFASTType() : m_type_sp(), m_die(), m_declaration() {} - UniqueDWARFASTType(lldb::TypeSP &type_sp, const DWARFDIE &die, - const Declaration &decl, int32_t byte_size) - : m_type_sp(type_sp), m_die(die), m_declaration(decl), - m_byte_size(byte_size) {} - UniqueDWARFASTType(const UniqueDWARFASTType &rhs) : m_type_sp(rhs.m_type_sp), m_die(rhs.m_die), - m_declaration(rhs.m_declaration), m_byte_size(rhs.m_byte_size) {} + m_declaration(rhs.m_declaration), m_byte_size(rhs.m_byte_size), + m_is_forward_declaration(rhs.m_is_forward_declaration) {} ~UniqueDWARFASTType() = default; - UniqueDWARFASTType &operator=(const UniqueDWARFASTType &rhs) { - if (this != &rhs) { - m_type_sp = rhs.m_type_sp; - m_die = rhs.m_die; - m_declaration = rhs.m_declaration; - m_byte_size = rhs.m_byte_size; - } - return *this; + // This UniqueDWARFASTType might be created from declaration, update its info + // to definition DIE. + void UpdateToDefDIE(const DWARFDIE &def_die, Declaration &declaration, + int32_t byte_size) { + // Need to update Type ID to refer to the definition DIE, because + // it's used in DWARFASTParserClang::ParseCXXMethod to determine if we need + // to copy cxx method types from a declaration DIE to this definition DIE. + m_type_sp->SetID(def_die.GetID()); + if (declaration.IsValid()) + m_declaration = declaration; + if (byte_size) + m_byte_size = byte_size; + m_is_forward_declaration = false; } lldb::TypeSP m_type_sp; DWARFDIE m_die; Declaration m_declaration; int32_t m_byte_size = -1; + // True if the m_die is a forward declaration DIE. + bool m_is_forward_declaration = true; }; class UniqueDWARFASTTypeList { @@ -62,8 +66,9 @@ class UniqueDWARFASTTypeList { m_collection.push_back(entry); } - bool Find(const DWARFDIE &die, const Declaration &decl, - const int32_t byte_size, UniqueDWARFASTType &entry) const; + UniqueDWARFASTType *Find(const DWARFDIE &die, const Declaration &decl, + const int32_t byte_size, + bool is_forward_declaration); protected: typedef std::vector collection; @@ -80,14 +85,15 @@ class UniqueDWARFASTTypeMap { m_collection[name.GetCString()].Append(entry); } - bool Find(ConstString name, const DWARFDIE &die, const Declaration &decl, - const int32_t byte_size, UniqueDWARFASTType &entry) const { + UniqueDWARFASTType *Find(ConstString name, const DWARFDIE &die, + const Declaration &decl, const int32_t byte_size, + bool is_forward_declaration) { const char *unique_name_cstr = name.GetCString(); - collection::const_iterator pos = m_collection.find(unique_name_cstr); + collection::iterator pos = m_collection.find(unique_name_cstr); if (pos != m_collection.end()) { - return pos->second.Find(die, decl, byte_size, entry); + return pos->second.Find(die, decl, byte_size, is_forward_declaration); } - return false; + return nullptr; } protected: diff --git a/lldb/source/Plugins/SymbolLocator/CMakeLists.txt b/lldb/source/Plugins/SymbolLocator/CMakeLists.txt index 3367022639ab85..ca969626f4ffc4 100644 --- a/lldb/source/Plugins/SymbolLocator/CMakeLists.txt +++ b/lldb/source/Plugins/SymbolLocator/CMakeLists.txt @@ -1,10 +1,5 @@ -# Order matters here: the first symbol locator prevents further searching. -# For DWARF binaries that are both stripped and split, the Default plugin -# will return the stripped binary when asked for the ObjectFile, which then -# prevents an unstripped binary from being requested from the Debuginfod -# provider. -add_subdirectory(Debuginfod) add_subdirectory(Default) if (CMAKE_SYSTEM_NAME MATCHES "Darwin") add_subdirectory(DebugSymbols) endif() +add_subdirectory(Debuginfod) diff --git a/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp b/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp index f296e655cc466d..b5fe35d71032a8 100644 --- a/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp +++ b/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp @@ -44,24 +44,6 @@ llvm::StringRef SymbolVendorELF::GetPluginDescriptionStatic() { "executables."; } -// If this is needed elsewhere, it can be exported/moved. -static bool IsDwpSymbolFile(const lldb::ModuleSP &module_sp, - const FileSpec &file_spec) { - DataBufferSP dwp_file_data_sp; - lldb::offset_t dwp_file_data_offset = 0; - // Try to create an ObjectFile from the file_spec. - ObjectFileSP dwp_obj_file = ObjectFile::FindPlugin( - module_sp, &file_spec, 0, FileSystem::Instance().GetByteSize(file_spec), - dwp_file_data_sp, dwp_file_data_offset); - // The presence of a debug_cu_index section is the key identifying feature of - // a DWP file. Make sure we don't fill in the section list on dwp_obj_file - // (by calling GetSectionList(false)) as this function could be called before - // we may have all the symbol files collected and available. - return dwp_obj_file && ObjectFileELF::classof(dwp_obj_file.get()) && - dwp_obj_file->GetSectionList(false)->FindSectionByType( - eSectionTypeDWARFDebugCuIndex, false); -} - // CreateInstance // // Platforms can register a callback to use when creating symbol vendors to @@ -105,15 +87,8 @@ SymbolVendorELF::CreateInstance(const lldb::ModuleSP &module_sp, FileSpecList search_paths = Target::GetDefaultDebugFileSearchPaths(); FileSpec dsym_fspec = PluginManager::LocateExecutableSymbolFile(module_spec, search_paths); - if (!dsym_fspec || IsDwpSymbolFile(module_sp, dsym_fspec)) { - // If we have a stripped binary or if we got a DWP file, we should prefer - // symbols in the executable acquired through a plugin. - ModuleSpec unstripped_spec = - PluginManager::LocateExecutableObjectFile(module_spec); - if (!unstripped_spec) - return nullptr; - dsym_fspec = unstripped_spec.GetFileSpec(); - } + if (!dsym_fspec) + return nullptr; DataBufferSP dsym_file_data_sp; lldb::offset_t dsym_file_data_offset = 0; diff --git a/lldb/source/Target/AssertFrameRecognizer.cpp b/lldb/source/Target/AssertFrameRecognizer.cpp index 5f4682bd5c11a5..da7c102645c014 100644 --- a/lldb/source/Target/AssertFrameRecognizer.cpp +++ b/lldb/source/Target/AssertFrameRecognizer.cpp @@ -2,6 +2,7 @@ #include "lldb/Core/Module.h" #include "lldb/Symbol/Function.h" #include "lldb/Symbol/SymbolContext.h" +#include "lldb/Symbol/SymbolLocation.h" #include "lldb/Target/Process.h" #include "lldb/Target/StackFrameList.h" #include "lldb/Target/Target.h" @@ -13,18 +14,6 @@ using namespace lldb; using namespace lldb_private; namespace lldb_private { - -/// Stores a function module spec, symbol name and possibly an alternate symbol -/// name. -struct SymbolLocation { - FileSpec module_spec; - std::vector symbols; - - // The symbols are regular expressions. In such case all symbols are matched - // with their trailing @VER symbol version stripped. - bool symbols_are_regex = false; -}; - /// Fetches the abort frame location depending on the current platform. /// /// \param[in] os diff --git a/lldb/source/Target/CMakeLists.txt b/lldb/source/Target/CMakeLists.txt index cf4818eae3eb8b..8186ccbea27d42 100644 --- a/lldb/source/Target/CMakeLists.txt +++ b/lldb/source/Target/CMakeLists.txt @@ -78,6 +78,7 @@ add_lldb_library(lldbTarget UnixSignals.cpp UnwindAssembly.cpp UnwindLLDB.cpp + VerboseTrapFrameRecognizer.cpp LINK_LIBS lldbBreakpoint diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index b91446e1c0e495..d990f8e714e229 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -63,6 +63,7 @@ #include "lldb/Target/ThreadPlanCallFunction.h" #include "lldb/Target/ThreadPlanStack.h" #include "lldb/Target/UnixSignals.h" +#include "lldb/Target/VerboseTrapFrameRecognizer.h" #include "lldb/Utility/AddressableBits.h" #include "lldb/Utility/Event.h" #include "lldb/Utility/LLDBLog.h" @@ -522,7 +523,11 @@ Process::Process(lldb::TargetSP target_sp, ListenerSP listener_sp, if (!value_sp->OptionWasSet() && platform_cache_line_size != 0) value_sp->SetValueAs(platform_cache_line_size); + // FIXME: Frame recognizer registration should not be done in Target. + // We should have a plugin do the registration instead, for example, a + // common C LanguageRuntime plugin. RegisterAssertFrameRecognizer(this); + RegisterVerboseTrapFrameRecognizer(*this); } Process::~Process() { @@ -4236,7 +4241,22 @@ bool Process::ProcessEventData::ShouldStop(Event *event_ptr, return still_should_stop; } +bool Process::ProcessEventData::ForwardEventToPendingListeners( + Event *event_ptr) { + // STDIO and the other async event notifications should always be forwarded. + if (event_ptr->GetType() != Process::eBroadcastBitStateChanged) + return true; + + // For state changed events, if the update state is zero, we are handling + // this on the private state thread. We should wait for the public event. + return m_update_state == 1; +} + void Process::ProcessEventData::DoOnRemoval(Event *event_ptr) { + // We only have work to do for state changed events: + if (event_ptr->GetType() != Process::eBroadcastBitStateChanged) + return; + ProcessSP process_sp(m_process_wp.lock()); if (!process_sp) diff --git a/lldb/source/Target/VerboseTrapFrameRecognizer.cpp b/lldb/source/Target/VerboseTrapFrameRecognizer.cpp new file mode 100644 index 00000000000000..fe72c8aec570d3 --- /dev/null +++ b/lldb/source/Target/VerboseTrapFrameRecognizer.cpp @@ -0,0 +1,122 @@ +#include "lldb/Target/VerboseTrapFrameRecognizer.h" + +#include "lldb/Core/Module.h" +#include "lldb/Symbol/Function.h" +#include "lldb/Symbol/SymbolContext.h" +#include "lldb/Target/Process.h" +#include "lldb/Target/StackFrameRecognizer.h" +#include "lldb/Target/Target.h" + +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" + +#include "clang/CodeGen/ModuleBuilder.h" + +using namespace llvm; +using namespace lldb; +using namespace lldb_private; + +VerboseTrapRecognizedStackFrame::VerboseTrapRecognizedStackFrame( + StackFrameSP most_relevant_frame_sp, std::string stop_desc) + : m_most_relevant_frame(most_relevant_frame_sp) { + m_stop_desc = std::move(stop_desc); +} + +lldb::RecognizedStackFrameSP +VerboseTrapFrameRecognizer::RecognizeFrame(lldb::StackFrameSP frame_sp) { + if (frame_sp->GetFrameIndex()) + return {}; + + ThreadSP thread_sp = frame_sp->GetThread(); + ProcessSP process_sp = thread_sp->GetProcess(); + + StackFrameSP most_relevant_frame_sp = thread_sp->GetStackFrameAtIndex(1); + + if (!most_relevant_frame_sp) { + Log *log = GetLog(LLDBLog::Unwind); + LLDB_LOG( + log, + "Failed to find most relevant frame: Hit unwinding bound (1 frame)!"); + return {}; + } + + SymbolContext sc = frame_sp->GetSymbolContext(eSymbolContextEverything); + + if (!sc.block) + return {}; + + // The runtime error is set as the function name in the inlined function info + // of frame #0 by the compiler + const InlineFunctionInfo *inline_info = nullptr; + Block *inline_block = sc.block->GetContainingInlinedBlock(); + + if (!inline_block) + return {}; + + inline_info = sc.block->GetInlinedFunctionInfo(); + + if (!inline_info) + return {}; + + auto func_name = inline_info->GetName().GetStringRef(); + if (func_name.empty()) + return {}; + + static auto trap_regex = + llvm::Regex(llvm::formatv("^{0}\\$(.*)\\$(.*)$", ClangTrapPrefix).str()); + SmallVector matches; + std::string regex_err_msg; + if (!trap_regex.match(func_name, &matches, ®ex_err_msg)) { + LLDB_LOGF(GetLog(LLDBLog::Unwind), + "Failed to parse match trap regex for '%s': %s", func_name.data(), + regex_err_msg.c_str()); + + return {}; + } + + // For `__clang_trap_msg$category$message$` we expect 3 matches: + // 1. entire string + // 2. category + // 3. message + if (matches.size() != 3) { + LLDB_LOGF(GetLog(LLDBLog::Unwind), + "Unexpected function name format. Expected '$$'$ but got: '%s'.", + func_name.data()); + + return {}; + } + + auto category = matches[1]; + auto message = matches[2]; + + std::string stop_reason = + category.empty() ? "" : category.str(); + if (!message.empty()) { + stop_reason += ": "; + stop_reason += message.str(); + } + + return std::make_shared( + most_relevant_frame_sp, std::move(stop_reason)); +} + +lldb::StackFrameSP VerboseTrapRecognizedStackFrame::GetMostRelevantFrame() { + return m_most_relevant_frame; +} + +namespace lldb_private { + +void RegisterVerboseTrapFrameRecognizer(Process &process) { + RegularExpressionSP module_regex_sp = nullptr; + auto symbol_regex_sp = std::make_shared( + llvm::formatv("^{0}", ClangTrapPrefix).str()); + + StackFrameRecognizerSP srf_recognizer_sp = + std::make_shared(); + + process.GetTarget().GetFrameRecognizerManager().AddRecognizer( + srf_recognizer_sp, module_regex_sp, symbol_regex_sp, false); +} + +} // namespace lldb_private diff --git a/lldb/source/Utility/Event.cpp b/lldb/source/Utility/Event.cpp index 863167e56bce6f..5f431c0a6dd899 100644 --- a/lldb/source/Utility/Event.cpp +++ b/lldb/source/Utility/Event.cpp @@ -83,14 +83,20 @@ void Event::Dump(Stream *s) const { void Event::DoOnRemoval() { std::lock_guard guard(m_listeners_mutex); - if (m_data_sp) - m_data_sp->DoOnRemoval(this); + if (!m_data_sp) + return; + + m_data_sp->DoOnRemoval(this); + // Now that the event has been handled by the primary event Listener, forward // it to the other Listeners. + EventSP me_sp = shared_from_this(); - for (auto listener_sp : m_pending_listeners) - listener_sp->AddEvent(me_sp); - m_pending_listeners.clear(); + if (m_data_sp->ForwardEventToPendingListeners(this)) { + for (auto listener_sp : m_pending_listeners) + listener_sp->AddEvent(me_sp); + m_pending_listeners.clear(); + } } #pragma mark - diff --git a/lldb/test/API/debuginfod/Normal/Makefile b/lldb/test/API/debuginfod/Normal/Makefile deleted file mode 100644 index 54bd7adae241f5..00000000000000 --- a/lldb/test/API/debuginfod/Normal/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -C_SOURCES := main.c - -# For normal (non DWP) Debuginfod tests, we need: - -# * The full binary: a.out.unstripped -# Produced by Makefile.rules with SAVE_FULL_DEBUG_BINARY set to YES and -# SPLIT_DEBUG_SYMBOLS set to YES - -# * The stripped binary (a.out) -# Produced by Makefile.rules with SPLIT_DEBUG_SYMBOLS set to YES - -# * The 'only-keep-debug' binary (a.out.debug) -# Produced below - -SPLIT_DEBUG_SYMBOLS := YES -SAVE_FULL_DEBUG_BINARY := YES -GEN_GNU_BUILD_ID := YES - -include Makefile.rules diff --git a/lldb/test/API/debuginfod/Normal/TestDebuginfod.py b/lldb/test/API/debuginfod/Normal/TestDebuginfod.py deleted file mode 100644 index 1860c56ef3e99b..00000000000000 --- a/lldb/test/API/debuginfod/Normal/TestDebuginfod.py +++ /dev/null @@ -1,186 +0,0 @@ -import os -import shutil -import tempfile - -import lldb -from lldbsuite.test.decorators import * -import lldbsuite.test.lldbutil as lldbutil -from lldbsuite.test.lldbtest import * - - -""" -Test support for the DebugInfoD network symbol acquisition protocol. -This one is for simple / no split-dwarf scenarios. - -For no-split-dwarf scenarios, there are 2 variations: -1 - A stripped binary with it's corresponding unstripped binary: -2 - A stripped binary with a corresponding --only-keep-debug symbols file -""" - - -class DebugInfodTests(TestBase): - # No need to try every flavor of debug inf. - NO_DEBUG_INFO_TESTCASE = True - - @skipUnlessPlatform(["linux", "freebsd"]) - def test_normal_no_symbols(self): - """ - Validate behavior with no symbols or symbol locator. - ('baseline negative' behavior) - """ - test_root = self.config_test(["a.out"]) - self.try_breakpoint(False) - - @skipUnlessPlatform(["linux", "freebsd"]) - def test_normal_default(self): - """ - Validate behavior with symbols, but no symbol locator. - ('baseline positive' behavior) - """ - test_root = self.config_test(["a.out", "a.out.debug"]) - self.try_breakpoint(True) - - @skipIfCurlSupportMissing - @skipUnlessPlatform(["linux", "freebsd"]) - def test_debuginfod_symbols(self): - """ - Test behavior with the full binary available from Debuginfod as - 'debuginfo' from the plug-in. - """ - test_root = self.config_test(["a.out"], "a.out.unstripped") - self.try_breakpoint(True) - - @skipIfCurlSupportMissing - @skipUnlessPlatform(["linux", "freebsd"]) - def test_debuginfod_executable(self): - """ - Test behavior with the full binary available from Debuginfod as - 'executable' from the plug-in. - """ - test_root = self.config_test(["a.out"], None, "a.out.unstripped") - self.try_breakpoint(True) - - @skipIfCurlSupportMissing - @skipUnlessPlatform(["linux", "freebsd"]) - def test_debuginfod_okd_symbols(self): - """ - Test behavior with the 'only-keep-debug' symbols available from Debuginfod. - """ - test_root = self.config_test(["a.out"], "a.out.debug") - self.try_breakpoint(True) - - def try_breakpoint(self, should_have_loc): - """ - This function creates a target from self.aout, sets a function-name - breakpoint, and checks to see if we have a file/line location, - as a way to validate that the symbols have been loaded. - should_have_loc specifies if we're testing that symbols have or - haven't been loaded. - """ - target = self.dbg.CreateTarget(self.aout) - self.assertTrue(target and target.IsValid(), "Target is valid") - - bp = target.BreakpointCreateByName("func") - self.assertTrue(bp and bp.IsValid(), "Breakpoint is valid") - self.assertEqual(bp.GetNumLocations(), 1) - - loc = bp.GetLocationAtIndex(0) - self.assertTrue(loc and loc.IsValid(), "Location is valid") - addr = loc.GetAddress() - self.assertTrue(addr and addr.IsValid(), "Loc address is valid") - line_entry = addr.GetLineEntry() - self.assertEqual( - should_have_loc, - line_entry != None and line_entry.IsValid(), - "Loc line entry is valid", - ) - if should_have_loc: - self.assertEqual(line_entry.GetLine(), 4) - self.assertEqual( - line_entry.GetFileSpec().GetFilename(), - self.main_source_file.GetFilename(), - ) - self.dbg.DeleteTarget(target) - shutil.rmtree(self.tmp_dir) - - def config_test(self, local_files, debuginfo=None, executable=None): - """ - Set up a test with local_files[] copied to a different location - so that we control which files are, or are not, found in the file system. - Also, create a stand-alone file-system 'hosted' debuginfod server with the - provided debuginfo and executable files (if they exist) - - Make the filesystem look like: - - /tmp//test/[local_files] - - /tmp//cache (for lldb to use as a temp cache) - - /tmp//buildid//executable -> - /tmp//buildid//debuginfo -> - Returns the /tmp/ path - """ - - self.build() - - uuid = self.getUUID("a.out") - if not uuid: - self.fail("Could not get UUID for a.out") - return - self.main_source_file = lldb.SBFileSpec("main.c") - self.tmp_dir = tempfile.mkdtemp() - test_dir = os.path.join(self.tmp_dir, "test") - os.makedirs(test_dir) - - self.aout = "" - # Copy the files used by the test: - for f in local_files: - shutil.copy(self.getBuildArtifact(f), test_dir) - # The first item is the binary to be used for the test - if self.aout == "": - self.aout = os.path.join(test_dir, f) - - use_debuginfod = debuginfo != None or executable != None - - # Populated the 'file://... mocked' Debuginfod server: - if use_debuginfod: - os.makedirs(os.path.join(self.tmp_dir, "cache")) - uuid_dir = os.path.join(self.tmp_dir, "buildid", uuid) - os.makedirs(uuid_dir) - if debuginfo: - shutil.copy( - self.getBuildArtifact(debuginfo), - os.path.join(uuid_dir, "debuginfo"), - ) - if executable: - shutil.copy( - self.getBuildArtifact(executable), - os.path.join(uuid_dir, "executable"), - ) - - # Configure LLDB for the test: - self.runCmd( - "settings set symbols.enable-external-lookup %s" - % str(use_debuginfod).lower() - ) - self.runCmd("settings clear plugin.symbol-locator.debuginfod.server-urls") - if use_debuginfod: - self.runCmd( - "settings set plugin.symbol-locator.debuginfod.cache-path %s/cache" - % self.tmp_dir - ) - self.runCmd( - "settings insert-before plugin.symbol-locator.debuginfod.server-urls 0 file://%s" - % self.tmp_dir - ) - - def getUUID(self, filename): - try: - spec = lldb.SBModuleSpec() - spec.SetFileSpec(lldb.SBFileSpec(self.getBuildArtifact(filename))) - module = lldb.SBModule(spec) - uuid = module.GetUUIDString().replace("-", "").lower() - # Don't want lldb's fake 32 bit CRC's for this one - return uuid if len(uuid) > 8 else None - except: - return None diff --git a/lldb/test/API/debuginfod/Normal/main.c b/lldb/test/API/debuginfod/Normal/main.c deleted file mode 100644 index 4c7184609b4536..00000000000000 --- a/lldb/test/API/debuginfod/Normal/main.c +++ /dev/null @@ -1,7 +0,0 @@ -// This is a dump little pair of test files - -int func(int argc, const char *argv[]) { - return (argc + 1) * (argv[argc][0] + 2); -} - -int main(int argc, const char *argv[]) { return func(0, argv); } diff --git a/lldb/test/API/debuginfod/SplitDWARF/Makefile b/lldb/test/API/debuginfod/SplitDWARF/Makefile deleted file mode 100644 index 3ab9a969e5a447..00000000000000 --- a/lldb/test/API/debuginfod/SplitDWARF/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -C_SOURCES := main.c - -# For split-dwarf Debuginfod tests, we need: - -# * A .DWP file (a.out.dwp) -# Produced by Makefile.rules with MAKE_DWP set to YES - -# * The "full" binary (missing things that live in .dwo's) (a.out.unstripped) -# Produced by Makefile.rules with SAVE_FULL_DEBUG_BINARY set to YES and -# SPLIT_DEBUG_SYMBOLS set to YES - -# * The stripped binary (a.out) -# Produced by Makefile.rules - -# * The 'only-keep-debug' binary (a.out.debug) -# Produced below - -MAKE_DWP := YES -SPLIT_DEBUG_SYMBOLS := YES -SAVE_FULL_DEBUG_BINARY := YES -GEN_GNU_BUILD_ID := YES - -include Makefile.rules diff --git a/lldb/test/API/debuginfod/SplitDWARF/TestDebuginfodDWP.py b/lldb/test/API/debuginfod/SplitDWARF/TestDebuginfodDWP.py deleted file mode 100644 index 437c83a820fb7f..00000000000000 --- a/lldb/test/API/debuginfod/SplitDWARF/TestDebuginfodDWP.py +++ /dev/null @@ -1,196 +0,0 @@ -""" -Test support for the DebugInfoD network symbol acquisition protocol. -""" -import os -import shutil -import tempfile - -import lldb -from lldbsuite.test.decorators import * -import lldbsuite.test.lldbutil as lldbutil -from lldbsuite.test.lldbtest import * - - -""" -Test support for the DebugInfoD network symbol acquisition protocol. -This file is for split-dwarf (dwp) scenarios. - -1 - A split binary target with it's corresponding DWP file -2 - A stripped, split binary target with an unstripped binary and a DWP file -3 - A stripped, split binary target with an --only-keep-debug symbols file and a DWP file -""" - - -class DebugInfodDWPTests(TestBase): - # No need to try every flavor of debug inf. - NO_DEBUG_INFO_TESTCASE = True - - @skipUnlessPlatform(["linux_freebsd_but_old_dwp_tools_on_build_bots_are_broken"]) - def test_normal_stripped(self): - """ - Validate behavior with a stripped binary, no symbols or symbol locator. - """ - self.config_test(["a.out"]) - self.try_breakpoint(False) - - @skipUnlessPlatform(["linux_freebsd_but_old_dwp_tools_on_build_bots_are_broken"]) - def test_normal_stripped_split_with_dwp(self): - """ - Validate behavior with symbols, but no symbol locator. - """ - self.config_test(["a.out", "a.out.debug", "a.out.dwp"]) - self.try_breakpoint(True) - - @skipUnlessPlatform(["linux_freebsd_but_old_dwp_tools_on_build_bots_are_broken"]) - def test_normal_stripped_only_dwp(self): - """ - Validate behavior *with* dwp symbols only, but missing other symbols, - but no symbol locator. This shouldn't work: without the other symbols - DWO's appear mostly useless. - """ - self.config_test(["a.out", "a.out.dwp"]) - self.try_breakpoint(False) - - @skipIfCurlSupportMissing - @skipUnlessPlatform(["linux_freebsd_but_old_dwp_tools_on_build_bots_are_broken"]) - def test_debuginfod_dwp_from_service(self): - """ - Test behavior with the unstripped binary, and DWP from the service. - """ - self.config_test(["a.out.debug"], "a.out.dwp") - self.try_breakpoint(True) - - @skipIfCurlSupportMissing - @skipUnlessPlatform(["linux_freebsd_but_old_dwp_tools_on_build_bots_are_broken"]) - def test_debuginfod_both_symfiles_from_service(self): - """ - Test behavior with a stripped binary, with the unstripped binary and - dwp symbols from Debuginfod. - """ - self.config_test(["a.out"], "a.out.dwp", "a.out.unstripped") - self.try_breakpoint(True) - - @skipIfCurlSupportMissing - @skipUnlessPlatform(["linux_freebsd_but_old_dwp_tools_on_build_bots_are_broken"]) - def test_debuginfod_both_okd_symfiles_from_service(self): - """ - Test behavior with both the only-keep-debug symbols and the dwp symbols - from Debuginfod. - """ - self.config_test(["a.out"], "a.out.dwp", "a.out.debug") - self.try_breakpoint(True) - - def try_breakpoint(self, should_have_loc): - """ - This function creates a target from self.aout, sets a function-name - breakpoint, and checks to see if we have a file/line location, - as a way to validate that the symbols have been loaded. - should_have_loc specifies if we're testing that symbols have or - haven't been loaded. - """ - target = self.dbg.CreateTarget(self.aout) - self.assertTrue(target and target.IsValid(), "Target is valid") - - bp = target.BreakpointCreateByName("func") - self.assertTrue(bp and bp.IsValid(), "Breakpoint is valid") - self.assertEqual(bp.GetNumLocations(), 1) - - loc = bp.GetLocationAtIndex(0) - self.assertTrue(loc and loc.IsValid(), "Location is valid") - addr = loc.GetAddress() - self.assertTrue(addr and addr.IsValid(), "Loc address is valid") - line_entry = addr.GetLineEntry() - self.assertEqual( - should_have_loc, - line_entry != None and line_entry.IsValid(), - "Loc line entry is valid", - ) - if should_have_loc: - self.assertEqual(line_entry.GetLine(), 4) - self.assertEqual( - line_entry.GetFileSpec().GetFilename(), - self.main_source_file.GetFilename(), - ) - self.dbg.DeleteTarget(target) - shutil.rmtree(self.tmp_dir) - - def config_test(self, local_files, debuginfo=None, executable=None): - """ - Set up a test with local_files[] copied to a different location - so that we control which files are, or are not, found in the file system. - Also, create a stand-alone file-system 'hosted' debuginfod server with the - provided debuginfo and executable files (if they exist) - - Make the filesystem look like: - - /tmp//test/[local_files] - - /tmp//cache (for lldb to use as a temp cache) - - /tmp//buildid//executable -> - /tmp//buildid//debuginfo -> - Returns the /tmp/ path - """ - - self.build() - - uuid = self.getUUID("a.out") - if not uuid: - self.fail("Could not get UUID for a.out") - return - self.main_source_file = lldb.SBFileSpec("main.c") - self.tmp_dir = tempfile.mkdtemp() - self.test_dir = os.path.join(self.tmp_dir, "test") - os.makedirs(self.test_dir) - - self.aout = "" - # Copy the files used by the test: - for f in local_files: - shutil.copy(self.getBuildArtifact(f), self.test_dir) - if self.aout == "": - self.aout = os.path.join(self.test_dir, f) - - use_debuginfod = debuginfo != None or executable != None - - # Populated the 'file://... mocked' Debuginfod server: - if use_debuginfod: - os.makedirs(os.path.join(self.tmp_dir, "cache")) - uuid_dir = os.path.join(self.tmp_dir, "buildid", uuid) - os.makedirs(uuid_dir) - if debuginfo: - shutil.copy( - self.getBuildArtifact(debuginfo), - os.path.join(uuid_dir, "debuginfo"), - ) - if executable: - shutil.copy( - self.getBuildArtifact(executable), - os.path.join(uuid_dir, "executable"), - ) - os.remove(self.getBuildArtifact("main.dwo")) - # Configure LLDB for the test: - self.runCmd( - "settings set symbols.enable-external-lookup %s" - % str(use_debuginfod).lower() - ) - self.runCmd("settings clear plugin.symbol-locator.debuginfod.server-urls") - if use_debuginfod: - self.runCmd( - "settings set plugin.symbol-locator.debuginfod.cache-path %s/cache" - % self.tmp_dir - ) - self.runCmd( - "settings insert-before plugin.symbol-locator.debuginfod.server-urls 0 file://%s" - % self.tmp_dir - ) - - def getUUID(self, filename): - try: - spec = lldb.SBModuleSpec() - spec.SetFileSpec(lldb.SBFileSpec(self.getBuildArtifact(filename))) - module = lldb.SBModule(spec) - uuid = module.GetUUIDString().replace("-", "").lower() - # Don't want lldb's fake 32 bit CRC's for this one - return uuid if len(uuid) > 8 else None - except: - return None diff --git a/lldb/test/API/debuginfod/SplitDWARF/main.c b/lldb/test/API/debuginfod/SplitDWARF/main.c deleted file mode 100644 index 4c7184609b4536..00000000000000 --- a/lldb/test/API/debuginfod/SplitDWARF/main.c +++ /dev/null @@ -1,7 +0,0 @@ -// This is a dump little pair of test files - -int func(int argc, const char *argv[]) { - return (argc + 1) * (argv[argc][0] + 2); -} - -int main(int argc, const char *argv[]) { return func(0, argv); } diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/simulator/Makefile b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/Makefile similarity index 100% rename from lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/simulator/Makefile rename to lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/Makefile diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/simulator/TestDataFormatterLibcxxStringSimulator.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/TestDataFormatterLibcxxStringSimulator.py similarity index 100% rename from lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/simulator/TestDataFormatterLibcxxStringSimulator.py rename to lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/TestDataFormatterLibcxxStringSimulator.py diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/simulator/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/main.cpp similarity index 84% rename from lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/simulator/main.cpp rename to lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/main.cpp index 33e71044482a75..7beeb9c39de49e 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/simulator/main.cpp +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/main.cpp @@ -1,3 +1,5 @@ +#include + #include #include #include @@ -32,37 +34,6 @@ namespace std { namespace __lldb { -template ::value && !std::is_final<_Tp>::value> -struct __compressed_pair_elem { - explicit __compressed_pair_elem(_Tp __t) : __value_(__t) {} - - _Tp &__get() { return __value_; } - -private: - _Tp __value_; -}; - -template -struct __compressed_pair_elem<_Tp, _Idx, true> : private _Tp { - explicit __compressed_pair_elem(_Tp __t) : _Tp(__t) {} - - _Tp &__get() { return *this; } -}; - -template -class __compressed_pair : private __compressed_pair_elem<_T1, 0>, - private __compressed_pair_elem<_T2, 1> { -public: - using _Base1 = __compressed_pair_elem<_T1, 0>; - using _Base2 = __compressed_pair_elem<_T2, 1>; - - explicit __compressed_pair(_T1 __t1, _T2 __t2) : _Base1(__t1), _Base2(__t2) {} - - _T1 &first() { return static_cast<_Base1 &>(*this).__get(); } -}; - #if defined(ALTERNATE_LAYOUT) && defined(SUBCLASS_PADDING) template struct __padding { unsigned char __xx[sizeof(_CharT) - 1]; @@ -212,7 +183,7 @@ template class basic_string { }; }; - __compressed_pair<__rep, allocator_type> __r_; + std::__lldb::__compressed_pair<__rep, allocator_type> __r_; public: template diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/Makefile b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/Makefile new file mode 100644 index 00000000000000..38cfa81053488c --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp +override CXXFLAGS_EXTRAS += -std=c++14 +include Makefile.rules diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/TestDataFormatterLibcxxUniquePtrSimulator.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/TestDataFormatterLibcxxUniquePtrSimulator.py new file mode 100644 index 00000000000000..da780f54bfd374 --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/TestDataFormatterLibcxxUniquePtrSimulator.py @@ -0,0 +1,24 @@ +""" +Test we can understand various layouts of the libc++'s std::unique_ptr +""" + + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class LibcxxUniquePtrDataFormatterSimulatorTestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def test(self): + self.build() + lldbutil.run_to_source_breakpoint( + self, "Break here", lldb.SBFileSpec("main.cpp") + ) + self.expect("frame variable var_up", substrs=["pointer ="]) + self.expect("frame variable var_up", substrs=["deleter ="], matching=False) + self.expect( + "frame variable var_with_deleter_up", substrs=["pointer =", "deleter ="] + ) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/main.cpp new file mode 100644 index 00000000000000..08324e24f9cc4d --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/main.cpp @@ -0,0 +1,37 @@ +#include + +namespace std { +namespace __lldb { +template struct default_delete { + default_delete() noexcept = default; + + void operator()(_Tp *__ptr) const noexcept { delete __ptr; } +}; + +template > class unique_ptr { +public: + typedef _Tp element_type; + typedef _Dp deleter_type; + typedef _Tp *pointer; + + std::__lldb::__compressed_pair __ptr_; + explicit unique_ptr(pointer __p) noexcept + : __ptr_(__p, std::__lldb::__value_init_tag()) {} +}; +} // namespace __lldb +} // namespace std + +struct StatefulDeleter { + StatefulDeleter() noexcept = default; + + void operator()(int *__ptr) const noexcept { delete __ptr; } + + int m_state = 50; +}; + +int main() { + std::__lldb::unique_ptr var_up(new int(5)); + std::__lldb::unique_ptr var_with_deleter_up(new int(5)); + __builtin_printf("Break here\n"); + return 0; +} diff --git a/lldb/test/API/functionalities/load_unload/TestLoadUnload.py b/lldb/test/API/functionalities/load_unload/TestLoadUnload.py index e52fb8c87377ff..cfbfaff10de3cc 100644 --- a/lldb/test/API/functionalities/load_unload/TestLoadUnload.py +++ b/lldb/test/API/functionalities/load_unload/TestLoadUnload.py @@ -62,7 +62,7 @@ def copy_shlibs_to_remote(self, hidden_dir=False): for f in shlibs: err = lldb.remote_platform.Put( lldb.SBFileSpec(self.getBuildArtifact(f)), - lldb.SBFileSpec(os.path.join(wd, f)), + lldb.SBFileSpec(lldbutil.join_remote_paths(wd, f)), ) if err.Fail(): raise RuntimeError( @@ -71,7 +71,7 @@ def copy_shlibs_to_remote(self, hidden_dir=False): if hidden_dir: shlib = "libloadunload_d." + ext hidden_dir = os.path.join(wd, "hidden") - hidden_file = os.path.join(hidden_dir, shlib) + hidden_file = lldbutil.join_remote_paths(hidden_dir, shlib) err = lldb.remote_platform.MakeDirectory(hidden_dir) if err.Fail(): raise RuntimeError( @@ -405,8 +405,10 @@ def run_step_over_load(self): # We can't find a breakpoint location for d_init before launching because # executable dependencies are resolved relative to the debuggers PWD. Bug? + # The remote lldb server resolves the executable dependencies correctly. @expectedFailureAll( - oslist=["freebsd", "linux", "netbsd"], triple=no_match("aarch64-.*-android") + oslist=["freebsd", "linux", "netbsd"], + remote=False, ) @expectedFailureAll(oslist=["windows"], archs=["aarch64"]) def test_static_init_during_load(self): diff --git a/lldb/test/API/python_api/event/TestEvents.py b/lldb/test/API/python_api/event/TestEvents.py index d8d3dd2d2b01b8..fb1a7e3bc6d3ac 100644 --- a/lldb/test/API/python_api/event/TestEvents.py +++ b/lldb/test/API/python_api/event/TestEvents.py @@ -7,7 +7,7 @@ from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil - +import random @skipIfLinux # llvm.org/pr25924, sometimes generating SIGSEGV class EventAPITestCase(TestBase): @@ -20,6 +20,7 @@ def setUp(self): self.line = line_number( "main.c", '// Find the line number of function "c" here.' ) + random.seed() @expectedFailureAll( oslist=["linux"], bugnumber="llvm.org/pr23730 Flaky, fails ~1/10 cases" @@ -318,6 +319,7 @@ def wait_for_next_event(self, expected_state, test_shadow=False): """Wait for an event from self.primary & self.shadow listener. If test_shadow is true, we also check that the shadow listener only receives events AFTER the primary listener does.""" + import stop_hook # Waiting on the shadow listener shouldn't have events yet because # we haven't fetched them for the primary listener yet: event = lldb.SBEvent() @@ -328,12 +330,23 @@ def wait_for_next_event(self, expected_state, test_shadow=False): # But there should be an event for the primary listener: success = self.primary_listener.WaitForEvent(5, event) + self.assertTrue(success, "Primary listener got the event") state = lldb.SBProcess.GetStateFromEvent(event) + primary_event_type = event.GetType() restart = False if state == lldb.eStateStopped: restart = lldb.SBProcess.GetRestartedFromEvent(event) + # This counter is matching the stop hooks, which don't get run + # for auto-restarting stops. + if not restart: + self.stop_counter += 1 + self.assertEqual( + stop_hook.StopHook.counter[self.instance], + self.stop_counter, + "matching stop hook", + ) if expected_state is not None: self.assertEqual( @@ -344,15 +357,18 @@ def wait_for_next_event(self, expected_state, test_shadow=False): # listener: success = self.shadow_listener.WaitForEvent(5, event) self.assertTrue(success, "Shadow listener got event too") + shadow_event_type = event.GetType() + self.assertEqual( + primary_event_type, shadow_event_type, "It was the same event type" + ) self.assertEqual( - state, lldb.SBProcess.GetStateFromEvent(event), "It was the same event" + state, lldb.SBProcess.GetStateFromEvent(event), "It was the same state" ) self.assertEqual( restart, lldb.SBProcess.GetRestartedFromEvent(event), "It was the same restarted", ) - return state, restart @expectedFlakeyLinux("llvm.org/pr23730") # Flaky, fails ~1/100 cases @@ -386,6 +402,20 @@ def test_shadow_listener(self): ) self.dbg.SetAsync(True) + # Now make our stop hook - we want to ensure it stays up to date with + # the events. We can't get our hands on the stop-hook instance directly, + # so we'll pass in an instance key, and use that to retrieve the data from + # this instance of the stop hook: + self.instance = f"Key{random.randint(0,10000)}" + stop_hook_path = os.path.join(self.getSourceDir(), "stop_hook.py") + self.runCmd(f"command script import {stop_hook_path}") + import stop_hook + + self.runCmd( + f"target stop-hook add -P stop_hook.StopHook -k instance -v {self.instance}" + ) + self.stop_counter = 0 + self.process = target.Launch(launch_info, error) self.assertSuccess(error, "Process launched successfully") @@ -395,6 +425,7 @@ def test_shadow_listener(self): # Events in the launch sequence might be platform dependent, so don't # expect any particular event till we get the stopped: state = lldb.eStateInvalid + while state != lldb.eStateStopped: state, restart = self.wait_for_next_event(None, False) @@ -412,8 +443,6 @@ def test_shadow_listener(self): self.cur_thread.GetStopReasonDataAtIndex(0), "Hit the right breakpoint", ) - # Disable the first breakpoint so it doesn't get in the way... - bkpt1.SetEnabled(False) self.cur_thread.StepOver() # We'll run the test for "shadow listener blocked by primary listener @@ -450,4 +479,9 @@ def test_shadow_listener(self): ) if state == lldb.eStateStopped and not restarted: self.process.Continue() + state, restarted = self.wait_for_next_event(None, False) + + # Now make sure we agree with the stop hook counter: + self.assertEqual(self.stop_counter, stop_hook.StopHook.counter[self.instance]) + self.assertEqual(stop_hook.StopHook.non_stops[self.instance], 0, "No non stops") diff --git a/lldb/test/API/python_api/event/stop_hook.py b/lldb/test/API/python_api/event/stop_hook.py new file mode 100644 index 00000000000000..932fa913366bce --- /dev/null +++ b/lldb/test/API/python_api/event/stop_hook.py @@ -0,0 +1,35 @@ +import lldb +import time + +class StopHook: + # These dictionaries are used to pass data back to the test case. + # Since these are global, we need to know which test run is which. + # The test passes a key in the extra_args, we use that as the key + # for these dictionaries, and then the test can fetch out the right + # one. + counter = {} + non_stops = {} + def __init__(self, target, extra_args, dict): + self.target = target + self.regs = {} + self.instance = extra_args.GetValueForKey("instance").GetStringValue(100) + StopHook.counter[self.instance] = 0 + StopHook.non_stops[self.instance] = 0 + + def handle_stop(self, exe_ctx, stream): + import time + # All this stop hook does is sleep a bit and count. There was a bug + # where we were sending the secondary listener events when the + # private state thread's DoOnRemoval completed, rather than when + # the primary public process Listener consumes the event. That + # became really clear when a stop hook artificially delayed the + # delivery of the primary listener's event - since IT had to come + # after the stop hook ran. + time.sleep(0.5) + StopHook.counter[self.instance] += 1 + # When we were sending events too early, one symptom was the stop + # event would get triggered before the state had been changed. + # Watch for that here. + if exe_ctx.process.GetState() != lldb.eStateStopped: + StopHook.non_stops[self.instance] += 1 + diff --git a/lldb/test/API/python_api/formatters/TestFormattersSBAPI.py b/lldb/test/API/python_api/formatters/TestFormattersSBAPI.py index 7e802f92da352a..c01c466b70c82a 100644 --- a/lldb/test/API/python_api/formatters/TestFormattersSBAPI.py +++ b/lldb/test/API/python_api/formatters/TestFormattersSBAPI.py @@ -143,6 +143,19 @@ def cleanup(): self.dbg.GetCategory("JASSynth").SetEnabled(True) self.expect("frame variable foo", matching=True, substrs=["X = 1"]) + self.dbg.GetCategory("CCCSynth2").SetEnabled(True) + self.expect( + "frame variable ccc", + matching=True, + substrs=[ + "CCC object with leading synthetic value (int) b = 222", + "a = 111", + "b = 222", + "c = 333", + ], + ) + self.dbg.GetCategory("CCCSynth2").SetEnabled(False) + self.dbg.GetCategory("CCCSynth").SetEnabled(True) self.expect( "frame variable ccc", @@ -155,6 +168,15 @@ def cleanup(): ], ) + self.dbg.GetCategory("BarIntSynth").SetEnabled(True) + self.expect( + "frame variable bar_int", + matching=True, + substrs=[ + "(int) bar_int = 20 bar_int synthetic: No value", + ], + ) + foo_var = ( self.dbg.GetSelectedTarget() .GetProcess() diff --git a/lldb/test/API/python_api/formatters/main.cpp b/lldb/test/API/python_api/formatters/main.cpp index f21c956144c29b..50c29657a09a9d 100644 --- a/lldb/test/API/python_api/formatters/main.cpp +++ b/lldb/test/API/python_api/formatters/main.cpp @@ -52,6 +52,8 @@ int main(int argc, char const *argv[]) { CCC ccc = {111, 222, 333}; + int bar_int = 20; + Empty1 e1; Empty2 e2; diff --git a/lldb/test/API/python_api/formatters/synth.py b/lldb/test/API/python_api/formatters/synth.py index 474c18bc62ebd9..91afb26af84361 100644 --- a/lldb/test/API/python_api/formatters/synth.py +++ b/lldb/test/API/python_api/formatters/synth.py @@ -29,11 +29,28 @@ def ccc_summary(sbvalue, internal_dict): # This tests that the SBValue.GetNonSyntheticValue() actually returns a # non-synthetic value. If it does not, then sbvalue.GetChildMemberWithName("a") # in the following statement will call the 'get_child_index' method of the - # synthetic child provider CCCSynthProvider below (which raises an - # exception). + # synthetic child provider CCCSynthProvider below (which return the "b" field"). return "CCC object with leading value " + str(sbvalue.GetChildMemberWithName("a")) +def ccc_synthetic(sbvalue, internal_dict): + sbvalue = sbvalue.GetSyntheticValue() + # This tests that the SBValue.GetSyntheticValue() actually returns a + # synthetic value. If it does, then sbvalue.GetChildMemberWithName("a") + # in the following statement will call the 'get_child_index' method of the + # synthetic child provider CCCSynthProvider below (which return the "b" field"). + return "CCC object with leading synthetic value " + str( + sbvalue.GetChildMemberWithName("a") + ) + + +def bar_int_synthetic(sbvalue, internal_dict): + sbvalue = sbvalue.GetSyntheticValue() + # This tests that the SBValue.GetSyntheticValue() actually returns no + # value when the value has no synthetic representation. + return "bar_int synthetic: " + str(sbvalue) + + class CCCSynthProvider(object): def __init__(self, sbvalue, internal_dict): self._sbvalue = sbvalue @@ -42,6 +59,9 @@ def num_children(self): return 3 def get_child_index(self, name): + if name == "a": + # Return b for test. + return 1 raise RuntimeError("I don't want to be called!") def get_child_at_index(self, index): @@ -119,3 +139,23 @@ def __lldb_init_module(debugger, dict): "synth.empty2_summary", lldb.eTypeOptionHideEmptyAggregates ), ) + cat2 = debugger.CreateCategory("CCCSynth2") + cat2.AddTypeSynthetic( + lldb.SBTypeNameSpecifier("CCC"), + lldb.SBTypeSynthetic.CreateWithClassName( + "synth.CCCSynthProvider", lldb.eTypeOptionCascade + ), + ) + cat2.AddTypeSummary( + lldb.SBTypeNameSpecifier("CCC"), + lldb.SBTypeSummary.CreateWithFunctionName( + "synth.ccc_synthetic", lldb.eTypeOptionCascade + ), + ) + cat3 = debugger.CreateCategory("BarIntSynth") + cat3.AddTypeSummary( + lldb.SBTypeNameSpecifier("int"), + lldb.SBTypeSummary.CreateWithFunctionName( + "synth.bar_int_synthetic", lldb.eTypeOptionCascade + ), + ) diff --git a/lldb/test/API/riscv/break-undecoded/Makefile b/lldb/test/API/riscv/break-undecoded/Makefile new file mode 100644 index 00000000000000..10495940055b63 --- /dev/null +++ b/lldb/test/API/riscv/break-undecoded/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/riscv/break-undecoded/TestBreakpointIllegal.py b/lldb/test/API/riscv/break-undecoded/TestBreakpointIllegal.py new file mode 100644 index 00000000000000..41e8901bf84ab4 --- /dev/null +++ b/lldb/test/API/riscv/break-undecoded/TestBreakpointIllegal.py @@ -0,0 +1,44 @@ +""" +Test that we can set up software breakpoint even if we failed to decode and execute instruction +""" + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class TestBreakpointIllegal(TestBase): + @skipIf(archs=no_match(["rv64gc"])) + def test_4(self): + self.build() + (target, process, cur_thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "main", lldb.SBFileSpec("main.c") + ) + self.runCmd("thread step-inst") + # we need to step more, as some compilers do not set appropriate debug info. + while cur_thread.GetStopDescription(256) == "instruction step into": + self.runCmd("thread step-inst") + # The stop reason of the thread should be illegal opcode. + self.expect( + "thread list", + STOPPED_DUE_TO_SIGNAL, + substrs=["stopped", "stop reason = signal SIGILL: illegal opcode"], + ) + + @skipIf(archs=no_match(["rv64gc"])) + def test_2(self): + self.build(dictionary={"C_SOURCES": "compressed.c", "EXE": "compressed.x"}) + (target, process, cur_thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "main", lldb.SBFileSpec("compressed.c"), exe_name="compressed.x" + ) + self.runCmd("thread step-inst") + # we need to step more, as some compilers do not set appropriate debug info. + while cur_thread.GetStopDescription(256) == "instruction step into": + self.runCmd("thread step-inst") + # The stop reason of the thread should be illegal opcode. + self.expect( + "thread list", + STOPPED_DUE_TO_SIGNAL, + substrs=["stopped", "stop reason = signal SIGILL: illegal opcode"], + ) diff --git a/lldb/test/API/riscv/break-undecoded/compressed.c b/lldb/test/API/riscv/break-undecoded/compressed.c new file mode 100644 index 00000000000000..a82ce9893cdb69 --- /dev/null +++ b/lldb/test/API/riscv/break-undecoded/compressed.c @@ -0,0 +1,7 @@ +int main() { + // This instruction is not valid, but we have an ability to set + // software breakpoint. + // This results in illegal instruction during execution, not fail to set + // breakpoint + asm volatile(".2byte 0xaf"); +} diff --git a/lldb/test/API/riscv/break-undecoded/main.c b/lldb/test/API/riscv/break-undecoded/main.c new file mode 100644 index 00000000000000..747923071e632e --- /dev/null +++ b/lldb/test/API/riscv/break-undecoded/main.c @@ -0,0 +1,7 @@ +int main() { + // This instruction is not valid, but we have an ability to set + // software breakpoint. + // This results in illegal instruction during execution, not fail to set + // breakpoint + asm volatile(".4byte 0xc58573" : :); +} diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py index b1b3d05ed45489..dd47a2db8709b9 100644 --- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py +++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py @@ -475,3 +475,34 @@ def test_terminate_commands(self): pattern=terminateCommands[0], ) self.verify_commands("terminateCommands", output, terminateCommands) + + @skipIfWindows + def test_version(self): + """ + Tests that "initialize" response contains the "version" string the same + as the one returned by "version" command. + """ + program = self.getBuildArtifact("a.out") + self.build_and_launch(program) + + source = "main.c" + breakpoint_line = line_number(source, "// breakpoint 1") + lines = [breakpoint_line] + # Set breakpoint in the thread function so we can step the threads + breakpoint_ids = self.set_source_breakpoints(source, lines) + self.continue_to_breakpoints(breakpoint_ids) + + version_eval_response = self.dap_server.request_evaluate( + "`version", context="repl" + ) + version_eval_output = version_eval_response["body"]["result"] + + # The first line is the prompt line like "(lldb) version", so we skip it. + version_eval_output_without_prompt_line = version_eval_output.splitlines()[1:] + lldb_json = self.dap_server.get_initialize_value("__lldb") + version_string = lldb_json["version"] + self.assertEqual( + version_eval_output_without_prompt_line, + version_string.splitlines(), + "version string does not match", + ) diff --git a/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py b/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py index 6296f6554d07e5..07acfe07c9ffce 100644 --- a/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py +++ b/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py @@ -10,9 +10,11 @@ class TestDAP_stepInTargets(lldbdap_testcase.DAPTestCaseBase): - @skipIf( - archs=no_match(["x86_64"]) - ) # InstructionControlFlowKind for ARM is not supported yet. + @expectedFailureAll(oslist=["windows"]) + @skipIf(archs=no_match(["x86_64"])) + # InstructionControlFlowKind for ARM is not supported yet. + # On Windows, lldb-dap seems to ignore targetId when stepping into functions. + # For more context, see https://github.com/llvm/llvm-project/issues/98509. def test_basic(self): """ Tests the basic stepping in targets with directly calls. @@ -55,14 +57,24 @@ def test_basic(self): self.assertEqual(len(step_in_targets), 3, "expect 3 step in targets") # Verify the target names are correct. - self.assertEqual(step_in_targets[0]["label"], "bar()", "expect bar()") - self.assertEqual(step_in_targets[1]["label"], "bar2()", "expect bar2()") - self.assertEqual( - step_in_targets[2]["label"], "foo(int, int)", "expect foo(int, int)" - ) + # The order of funcA and funcB may change depending on the compiler ABI. + funcA_target = None + funcB_target = None + for target in step_in_targets[0:2]: + if "funcB" in target["label"]: + funcB_target = target + elif "funcA" in target["label"]: + funcA_target = target + else: + self.fail(f"Unexpected step in target: {target}") + + self.assertIsNotNone(funcA_target, "expect funcA") + self.assertIsNotNone(funcB_target, "expect funcB") + self.assertIn("foo", step_in_targets[2]["label"], "expect foo") - # Choose to step into second target and verify that we are in bar2() + # Choose to step into second target and verify that we are in the second target, + # be it funcA or funcB. self.stepIn(threadId=tid, targetId=step_in_targets[1]["id"], waitForStop=True) leaf_frame = self.dap_server.get_stackFrame() self.assertIsNotNone(leaf_frame, "expect a leaf frame") - self.assertEqual(leaf_frame["name"], "bar2()") + self.assertEqual(step_in_targets[1]["label"], leaf_frame["name"]) diff --git a/lldb/test/API/tools/lldb-dap/stepInTargets/main.cpp b/lldb/test/API/tools/lldb-dap/stepInTargets/main.cpp index d3c3dbcc139ef0..a48b79af0c7605 100644 --- a/lldb/test/API/tools/lldb-dap/stepInTargets/main.cpp +++ b/lldb/test/API/tools/lldb-dap/stepInTargets/main.cpp @@ -1,11 +1,11 @@ int foo(int val, int extra) { return val + extra; } -int bar() { return 22; } +int funcA() { return 22; } -int bar2() { return 54; } +int funcB() { return 54; } int main(int argc, char const *argv[]) { - foo(bar(), bar2()); // set breakpoint here + foo(funcA(), funcB()); // set breakpoint here return 0; } diff --git a/lldb/test/Shell/Recognizer/Inputs/verbose_trap.cpp b/lldb/test/Shell/Recognizer/Inputs/verbose_trap.cpp new file mode 100644 index 00000000000000..89962b54379980 --- /dev/null +++ b/lldb/test/Shell/Recognizer/Inputs/verbose_trap.cpp @@ -0,0 +1,12 @@ +#if !defined(VERBOSE_TRAP_TEST_CATEGORY) || !defined(VERBOSE_TRAP_TEST_MESSAGE) +#error Please define required macros +#endif + +struct Dummy { + void func() { __builtin_verbose_trap(VERBOSE_TRAP_TEST_CATEGORY, VERBOSE_TRAP_TEST_MESSAGE); } +}; + +int main() { + Dummy{}.func(); + return 0; +} diff --git a/lldb/test/Shell/Recognizer/verbose_trap.test b/lldb/test/Shell/Recognizer/verbose_trap.test new file mode 100644 index 00000000000000..45ef84bef611fe --- /dev/null +++ b/lldb/test/Shell/Recognizer/verbose_trap.test @@ -0,0 +1,22 @@ +# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"Foo\" -DVERBOSE_TRAP_TEST_MESSAGE=\"Bar\" +# RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-BOTH +# +# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"\" -DVERBOSE_TRAP_TEST_MESSAGE=\"Bar\" +# RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-MESSAGE_ONLY +# +# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"Foo\" -DVERBOSE_TRAP_TEST_MESSAGE=\"\" +# RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-CATEGORY_ONLY +# +# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"\" -DVERBOSE_TRAP_TEST_MESSAGE=\"\" +# RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-NONE + +run +# CHECK-BOTH: thread #{{.*}}stop reason = Foo: Bar +# CHECK-MESSAGE_ONLY: thread #{{.*}}stop reason = : Bar +# CHECK-CATEGORY_ONLY: thread #{{.*}}stop reason = Foo +# CHECK-NONE: thread #{{.*}}stop reason = +frame info +# CHECK: frame #{{.*}}`Dummy::func(this={{.*}}) at verbose_trap.cpp +frame recognizer info 0 +# CHECK: frame 0 is recognized by Verbose Trap StackFrame Recognizer +q diff --git a/lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test b/lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test new file mode 100644 index 00000000000000..d253981b498c81 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test @@ -0,0 +1,36 @@ +# Test definition DIE searching is delayed until complete type is required. + +# UNSUPPORTED: system-windows + +# RUN: split-file %s %t +# RUN: %clangxx_host %t/main.cpp %t/t1_def.cpp -gdwarf -o %t.out +# RUN: %lldb -b %t.out -s %t/lldb.cmd | FileCheck %s + +# CHECK: (lldb) p v1 +# CHECK: DWARFASTParserClang::ParseTypeFromDWARF{{.*}}DW_TAG_structure_type (DW_TAG_structure_type) name = 't2' +# CHECK: DWARFASTParserClang::ParseTypeFromDWARF{{.*}}DW_TAG_structure_type (DW_TAG_structure_type) name = 't1' +# CHECK: DW_TAG_structure_type (DW_TAG_structure_type) 't2' resolving forward declaration... +# CHECK: (t2) {} +# CHECK: (lldb) p v2 +# CHECK: DWARFASTParserClang::ParseTypeFromDWARF{{.*}}DW_TAG_structure_type (DW_TAG_structure_type) name = 't1' +# CHECK: DW_TAG_structure_type (DW_TAG_structure_type) 't1' resolving forward declaration... + +#--- lldb.cmd +log enable dwarf comp +p v1 +p v2 + +#--- main.cpp +template +struct t2 { +}; +struct t1; +t2 v1; // this CU doesn't have definition DIE for t1, but only declaration DIE for it. +int main() { +} + +#--- t1_def.cpp +struct t1 { // this CU contains definition DIE for t1. + int x; +}; +t1 v2; diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/simple-template-names-context.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp similarity index 77% rename from lldb/test/Shell/SymbolFile/DWARF/x86/simple-template-names-context.cpp rename to lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp index 8070b7a19abccc..bce6ed36b0968e 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/simple-template-names-context.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp @@ -9,6 +9,11 @@ // RUN: ld.lld %t-a.o %t-b.o -o %t // RUN: %lldb %t -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s +// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-a.o -g -fdebug-types-section -DFILE_A +// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-b.o -g -fdebug-types-section -DFILE_B +// RUN: ld.lld %t-a.o %t-b.o -o %t +// RUN: %lldb %t -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s + // CHECK: (lldb) target variable // CHECK-NEXT: (ReferencesBoth<'A'>) both_a = { // CHECK-NEXT: (Outer<'A'>::Inner *) a = 0x{{[0-9A-Fa-f]*}} {} @@ -19,13 +24,11 @@ // CHECK-NEXT: (Outer<'B'>::Inner *) b = 0x{{[0-9A-Fa-f]*}} {} // CHECK-NEXT: } -template -struct Outer { +template struct Outer { struct Inner {}; }; -template -struct ReferencesBoth { +template struct ReferencesBoth { Outer<'A'>::Inner *a; Outer<'B'>::Inner *b; }; diff --git a/lldb/test/Shell/SymbolFile/add-dsym.test b/lldb/test/Shell/SymbolFile/add-dsym.test index cdcba641957d1e..52d1a1363feef0 100644 --- a/lldb/test/Shell/SymbolFile/add-dsym.test +++ b/lldb/test/Shell/SymbolFile/add-dsym.test @@ -1,5 +1,8 @@ # REQUIRES: system-darwin +# RUN: %lldb -o 'help add-dsym' | FileCheck %s --check-prefix=HELP +# HELP: Syntax: add-dsym + # RUN: yaml2obj %S/Inputs/a.yaml -o %t.out # RUN: LLDB_APPLE_DSYMFORUUID_EXECUTABLE=%S/Inputs/dsymforuuid.sh %lldb %t.out -o 'add-dsym -u 41945CA4-5D9D-3CDE-82B4-37E4C09750B5' 2>&1 | FileCheck %s # CHECK: UUID information was not found diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp index b50d40acb51a21..ea84f31aec3a6c 100644 --- a/lldb/tools/lldb-dap/lldb-dap.cpp +++ b/lldb/tools/lldb-dap/lldb-dap.cpp @@ -1719,6 +1719,11 @@ void request_initialize(const llvm::json::Object &request) { // The debug adapter supports data watchpoints. body.try_emplace("supportsDataBreakpoints", true); + // Put in non-DAP specification lldb specific information. + llvm::json::Object lldb_json; + lldb_json.try_emplace("version", g_dap.debugger.GetVersionString()); + body.try_emplace("__lldb", std::move(lldb_json)); + response.try_emplace("body", std::move(body)); g_dap.SendJSON(llvm::json::Value(std::move(response))); } diff --git a/lldb/unittests/Process/ProcessEventDataTest.cpp b/lldb/unittests/Process/ProcessEventDataTest.cpp index e793c6eae20a29..9f65b71fc1c318 100644 --- a/lldb/unittests/Process/ProcessEventDataTest.cpp +++ b/lldb/unittests/Process/ProcessEventDataTest.cpp @@ -142,6 +142,13 @@ ThreadSP CreateThread(ProcessSP &process_sp, bool should_stop, return thread_sp; } +// Disable this test till I figure out why changing how events are sent +// to Secondary Listeners (44d9692e6a657ec46e98e4912ac56417da67cfee) +// caused this test to fail. It is testing responses to events that are +// not delivered in the way Process events are meant to be delivered, it +// bypasses the private event queue, and I'm not sure is testing real +// behaviors. +#if 0 TEST_F(ProcessEventDataTest, DoOnRemoval) { ArchSpec arch("x86_64-apple-macosx-"); @@ -181,6 +188,7 @@ TEST_F(ProcessEventDataTest, DoOnRemoval) { ->m_should_stop_hit_count == 0; ASSERT_TRUE(result); } +#endif TEST_F(ProcessEventDataTest, ShouldStop) { ArchSpec arch("x86_64-apple-macosx-"); diff --git a/llvm/docs/Frontend/PerformanceTips.rst b/llvm/docs/Frontend/PerformanceTips.rst index 289106cd1e28ea..4baf127bf050b0 100644 --- a/llvm/docs/Frontend/PerformanceTips.rst +++ b/llvm/docs/Frontend/PerformanceTips.rst @@ -206,7 +206,9 @@ Other Things to Consider that fact is critical for optimization purposes. Assumes are a great prototyping mechanism, but they can have negative effects on both compile time and optimization effectiveness. The former is fixable with enough - effort, but the later is fairly fundamental to their designed purpose. + effort, but the later is fairly fundamental to their designed purpose. If + you are creating a non-terminator unreachable instruction or passing a false + value, use the ``store i1 true, ptr poison, align 1`` canonical form. Describing Language Specific Properties diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index a04b5769f095fb..40c8b7f7695968 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -15026,7 +15026,7 @@ Arguments: """""""""" The first argument is a pointer to the destination, the second is a -pointer to the source. The third argument is a constant integer argument +pointer to the source. The third argument is an integer argument specifying the number of bytes to copy, and the fourth is a boolean indicating a volatile access. diff --git a/llvm/docs/MyFirstTypoFix.rst b/llvm/docs/MyFirstTypoFix.rst index 86ea32c6a441bb..733b3eac141f27 100644 --- a/llvm/docs/MyFirstTypoFix.rst +++ b/llvm/docs/MyFirstTypoFix.rst @@ -9,8 +9,14 @@ Introduction ============ This tutorial will guide you through the process of making a change to -LLVM, and contributing it back to the LLVM project. We'll be making a -change to Clang, but the steps for other parts of LLVM are the same. +LLVM, and contributing it back to the LLVM project. + +.. note:: + The code changes presented here are only an example and not something you + should actually submit to the LLVM project. For your first real change to LLVM, + the code will be different but the rest of the guide will still apply. + +We'll be making a change to Clang, but the steps for other parts of LLVM are the same. Even though the change we'll be making is simple, we're going to cover steps like building LLVM, running the tests, and code review. This is good practice, and you'll be prepared for making larger changes. diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 5fd007582f34da..4474478b6d3f8b 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -83,7 +83,7 @@ ISA naming string. Currently supported profiles: * ``rva22u64`` * ``rva22s64`` -Note that you can also append additional extension names to be enable, e.g. +Note that you can also append additional extension names to be enabled, e.g. ``rva20u64_zicond`` will enable the ``zicond`` extension in addition to those in the ``rva20u64`` profile. @@ -303,7 +303,7 @@ The primary goal of experimental support is to assist in the process of ratifica LLVM implements the `0.0.5 draft specification `__. ``experimental-zicfilp``, ``experimental-zicfiss`` - LLVM implements the `0.4 draft specification `__. + LLVM implements the `1.0 release specification `__. To use an experimental extension from `clang`, you must add `-menable-experimental-extensions` to the command line, and specify the exact version of the experimental extension you are using. To use an experimental extension with LLVM's internal developer tools (e.g. `llc`, `llvm-objdump`, `llvm-mc`), you must prefix the extension name with `experimental-`. Note that you don't need to specify the version with internal tools, and shouldn't include the `experimental-` prefix with `clang`. diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 55b3b486d705df..311ae0ea255ef6 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -203,6 +203,8 @@ Changes to the RISC-V Backend * Ztso is no longer experimental. * The WCH / Nanjing Qinheng Microelectronics QingKe "XW" compressed opcodes are supported under the name "Xwchc". +* ``-mcpu=native`` now detects available features with hwprobe (RISC-V Hardware Probing Interface) on Linux 6.4 or later. +* The version of Zicfilp/Zicfiss is updated to 1.0. Changes to the WebAssembly Backend ---------------------------------- @@ -216,6 +218,9 @@ Changes to the X86 Backend - Removed knl/knm specific ISA intrinsics: AVX512PF, AVX512ER, PREFETCHWT1, while assembly encoding/decoding supports are kept. +- Removed ``3DNow!``-specific ISA intrinsics and codegen support. The ``3dnow`` and ``3dnowa`` target features are no longer supported. The intrinsics ``llvm.x86.3dnow.*``, ``llvm.x86.3dnowa.*``, and ``llvm.x86.mmx.femms`` have been removed. Assembly encoding/decoding for the corresponding instructions remains supported. + + Changes to the OCaml bindings ----------------------------- @@ -307,6 +312,14 @@ They are described in detail in the `debug info migration guide LLVM IR Value Mapping +Each LLVM IR Value maps to a single Sandbox IR Value. +The reverse is also true in most cases, except for Sandbox IR Instructions that map to more than one LLVM IR Instruction. +Such instructions can be defined in extensions of the base Sandbox IR. + +- Forward mapping: Sandbox IR Value -> LLVM IR Value +Each Sandbox IR Value contains an `llvm::Value *Val` member variable that points to the corresponding LLVM IR Value. + +- Reverse mapping: LLVM IR Value -> Sandbox IR Value +This mapping is stored in `sandboxir::Context::LLVMValueToValue`. + +For example `sandboxir::User::getOperand(OpIdx)` for a `sandboxir::User *U` works as follows: +- First we find the LLVM User: `llvm::User *LLVMU = U->Val`. +- Next we get the LLVM Value operand: `llvm::Value *LLVMOp = LLVMU->getOperand(OpIdx)` +- Finally we get the Sandbox IR operand that corresponds to `LLVMOp` by querying the map in the Sandbox IR context: `retrun Ctx.getValue(LLVMOp)`. + +## Sandbox IR is Write-Through +Sandbox IR is designed to rely on LLVM IR for its state. +So any change made to Sandbox IR objects directly updates the corresponding LLVM IR. + +This has the following benefits: +- It minimizes the replication of state, and +- It makes sure that Sandbox IR and LLVM IR are always in sync, which helps avoid bugs and removes the need for a lowering step. +- No need for serialization/de-serialization infrastructure as we can rely on LLVM IR for it. +- One can pass actual `llvm::Instruction`s to cost modeling APIs. + +Sandbox IR API functions that modify the IR state call the corresponding LLVM IR function that modifies the LLVM IR's state. +For example, for `sandboxir::User::setOperand(OpIdx, sandboxir::Value *Op)`: +- We get the corresponding LLVM User: `llvm::User *LLVMU = cast(Val)` +- Next we get the corresponding LLVM Operand: `llvm::Value *LLVMOp = Op->Val` +- Finally we modify `LLVMU`'s operand: `LLVMU->setOperand(OpIdx, LLVMOp) diff --git a/llvm/docs/UserGuides.rst b/llvm/docs/UserGuides.rst index bf7cdda89a0093..86101ffbd9ca5d 100644 --- a/llvm/docs/UserGuides.rst +++ b/llvm/docs/UserGuides.rst @@ -67,6 +67,7 @@ intermediate LLVM representation. RISCV/RISCVVectorExtension SourceLevelDebugging SPIRVUsage + SandboxIR StackSafetyAnalysis SupportLibrary TableGen/index @@ -192,6 +193,7 @@ Optimizations This document specifies guidelines for contributions for InstCombine and related passes. + Code Generation --------------- @@ -288,3 +290,6 @@ Additional Topics :doc:`RISCV/RISCVVectorExtension` This document describes how the RISC-V Vector extension can be expressed in LLVM IR and how code is generated for it in the backend. + +:doc:`Sandbox IR ` + This document describes the design and usage of Sandbox IR, a transactional layer over LLVM IR. diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index 9867db4839fe19..51033175b2adef 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -510,6 +510,20 @@ enum { */ typedef unsigned LLVMFastMathFlags; +enum { + LLVMGEPFlagInBounds = (1 << 0), + LLVMGEPFlagNUSW = (1 << 1), + LLVMGEPFlagNUW = (1 << 2), +}; + +/** + * Flags that constrain the allowed wrap semantics of a getelementptr + * instruction. + * + * See https://llvm.org/docs/LangRef.html#getelementptr-instruction + */ +typedef unsigned LLVMGEPNoWrapFlags; + /** * @} */ @@ -2395,6 +2409,17 @@ LLVMValueRef LLVMConstGEP2(LLVMTypeRef Ty, LLVMValueRef ConstantVal, LLVMValueRef LLVMConstInBoundsGEP2(LLVMTypeRef Ty, LLVMValueRef ConstantVal, LLVMValueRef *ConstantIndices, unsigned NumIndices); +/** + * Creates a constant GetElementPtr expression. Similar to LLVMConstGEP2, but + * allows specifying the no-wrap flags. + * + * @see llvm::ConstantExpr::getGetElementPtr() + */ +LLVMValueRef LLVMConstGEPWithNoWrapFlags(LLVMTypeRef Ty, + LLVMValueRef ConstantVal, + LLVMValueRef *ConstantIndices, + unsigned NumIndices, + LLVMGEPNoWrapFlags NoWrapFlags); LLVMValueRef LLVMConstTrunc(LLVMValueRef ConstantVal, LLVMTypeRef ToType); LLVMValueRef LLVMConstPtrToInt(LLVMValueRef ConstantVal, LLVMTypeRef ToType); LLVMValueRef LLVMConstIntToPtr(LLVMValueRef ConstantVal, LLVMTypeRef ToType); @@ -3904,6 +3929,20 @@ void LLVMSetIsInBounds(LLVMValueRef GEP, LLVMBool InBounds); */ LLVMTypeRef LLVMGetGEPSourceElementType(LLVMValueRef GEP); +/** + * Get the no-wrap related flags for the given GEP instruction. + * + * @see llvm::GetElementPtrInst::getNoWrapFlags + */ +LLVMGEPNoWrapFlags LLVMGEPGetNoWrapFlags(LLVMValueRef GEP); + +/** + * Set the no-wrap related flags for the given GEP instruction. + * + * @see llvm::GetElementPtrInst::setNoWrapFlags + */ +void LLVMGEPSetNoWrapFlags(LLVMValueRef GEP, LLVMGEPNoWrapFlags NoWrapFlags); + /** * @} */ @@ -4363,6 +4402,17 @@ LLVMValueRef LLVMBuildGEP2(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef LLVMBuildInBoundsGEP2(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef Pointer, LLVMValueRef *Indices, unsigned NumIndices, const char *Name); +/** + * Creates a GetElementPtr instruction. Similar to LLVMBuildGEP2, but allows + * specifying the no-wrap flags. + * + * @see llvm::IRBuilder::CreateGEP() + */ +LLVMValueRef LLVMBuildGEPWithNoWrapFlags(LLVMBuilderRef B, LLVMTypeRef Ty, + LLVMValueRef Pointer, + LLVMValueRef *Indices, + unsigned NumIndices, const char *Name, + LLVMGEPNoWrapFlags NoWrapFlags); LLVMValueRef LLVMBuildStructGEP2(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef Pointer, unsigned Idx, const char *Name); diff --git a/llvm/include/llvm/ADT/ArrayRef.h b/llvm/include/llvm/ADT/ArrayRef.h index 1c6799f1c56eda..ac40ec4a6b2404 100644 --- a/llvm/include/llvm/ADT/ArrayRef.h +++ b/llvm/include/llvm/ADT/ArrayRef.h @@ -460,11 +460,8 @@ namespace llvm { OwningArrayRef &operator=(OwningArrayRef &&Other) { delete[] this->data(); - using Base = MutableArrayRef; - // GCC versions prior to 11.1 incorrectly reject if the 'template' keyword - // is used prior to the nested-name-specifier here. - this->Base::operator=(Other); - Other.Base::operator=(Base()); + this->MutableArrayRef::operator=(Other); + Other.MutableArrayRef::operator=(MutableArrayRef()); return *this; } diff --git a/llvm/include/llvm/ADT/PackedVector.h b/llvm/include/llvm/ADT/PackedVector.h index b448685ab6163b..4a6986669c936e 100644 --- a/llvm/include/llvm/ADT/PackedVector.h +++ b/llvm/include/llvm/ADT/PackedVector.h @@ -141,6 +141,8 @@ class PackedVector : public PackedVectorBase> & + DenseMap, + std::pair> & getPointerBounds() { return PointerBounds; } @@ -334,7 +335,9 @@ class MemoryDepChecker { /// Mapping of SCEV expressions to their expanded pointer bounds (pair of /// start and end pointer expressions). - DenseMap> PointerBounds; + DenseMap, + std::pair> + PointerBounds; /// Check whether there is a plausible dependence between the two /// accesses. diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h index c98d08fa0888db..db5e80ccdbaaba 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h @@ -327,11 +327,9 @@ class TargetLibraryInfo { bool AllowCallerSuperset) const { if (!AllowCallerSuperset) return OverrideAsUnavailable == CalleeTLI.OverrideAsUnavailable; - BitVector B = OverrideAsUnavailable; - B |= CalleeTLI.OverrideAsUnavailable; - // We can inline if the union of the caller and callee's nobuiltin - // attributes is no stricter than the caller's nobuiltin attributes. - return B == OverrideAsUnavailable; + // We can inline if the callee's nobuiltin attributes are no stricter than + // the caller's. + return !CalleeTLI.OverrideAsUnavailable.test(OverrideAsUnavailable); } /// Return true if the function type FTy is valid for the library function diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index 3c9495c1b372ce..35283637027db8 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -255,6 +255,23 @@ void processShuffleMasks( function_ref, unsigned, unsigned)> SingleInputAction, function_ref, unsigned, unsigned)> ManyInputsAction); +/// Compute the demanded elements mask of horizontal binary operations. A +/// horizontal operation combines two adjacent elements in a vector operand. +/// This function returns a mask for the elements that correspond to the first +/// operand of this horizontal combination. For example, for two vectors +/// [X1, X2, X3, X4] and [Y1, Y2, Y3, Y4], the resulting mask can include the +/// elements X1, X3, Y1, and Y3. To get the other operands, simply shift the +/// result of this function to the left by 1. +/// +/// \param VectorBitWidth the total bit width of the vector +/// \param DemandedElts the demanded elements mask for the operation +/// \param DemandedLHS the demanded elements mask for the left operand +/// \param DemandedRHS the demanded elements mask for the right operand +void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, + const APInt &DemandedElts, + APInt &DemandedLHS, + APInt &DemandedRHS); + /// Compute a map of integer instructions to their minimum legal type /// size. /// diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index 284f434fbb9b0c..6e2ab8ce403387 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -22,7 +22,7 @@ #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/TargetOpcodes.h" namespace llvm { @@ -373,6 +373,8 @@ class LegalizerHelper { /// Perform Bitcast legalize action on G_INSERT_VECTOR_ELT. LegalizeResult bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx, LLT CastTy); + LegalizeResult bitcastConcatVector(MachineInstr &MI, unsigned TypeIdx, + LLT CastTy); LegalizeResult lowerConstant(MachineInstr &MI); LegalizeResult lowerFConstant(MachineInstr &MI); diff --git a/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h b/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h index 2b177e6763d39e..d4edacedb88e6d 100644 --- a/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h +++ b/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h @@ -16,6 +16,7 @@ #define LLVM_CODEGEN_MACHINEOPTIMIZATIONREMARKEMITTER_H #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePassManager.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include @@ -155,6 +156,13 @@ class MachineOptimizationRemarkEmitter { MachineBlockFrequencyInfo *MBFI) : MF(MF), MBFI(MBFI) {} + MachineOptimizationRemarkEmitter(MachineOptimizationRemarkEmitter &&) = + default; + + /// Handle invalidation events in the new pass manager. + bool invalidate(MachineFunction &MF, const PreservedAnalyses &PA, + MachineFunctionAnalysisManager::Invalidator &Inv); + /// Emit an optimization remark. void emit(DiagnosticInfoOptimizationBase &OptDiag); @@ -212,6 +220,17 @@ class MachineOptimizationRemarkEmitter { bool shouldEmitVerbose() { return MBFI != nullptr; } }; +/// The analysis pass +class MachineOptimizationRemarkEmitterAnalysis + : public AnalysisInfoMixin { + friend AnalysisInfoMixin; + static AnalysisKey Key; + +public: + using Result = MachineOptimizationRemarkEmitter; + Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM); +}; + /// The analysis pass /// /// Note that this pass shouldn't generally be marked as preserved by other diff --git a/llvm/include/llvm/CodeGen/MachineVerifier.h b/llvm/include/llvm/CodeGen/MachineVerifier.h new file mode 100644 index 00000000000000..bfd0681fb79545 --- /dev/null +++ b/llvm/include/llvm/CodeGen/MachineVerifier.h @@ -0,0 +1,28 @@ +//===- llvm/CodeGen/MachineVerifier.h - Machine Code Verifier ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_MACHINEVERIFIER_H +#define LLVM_CODEGEN_MACHINEVERIFIER_H + +#include "llvm/CodeGen/MachinePassManager.h" +#include + +namespace llvm { +class MachineVerifierPass : public PassInfoMixin { + std::string Banner; + +public: + MachineVerifierPass(const std::string &Banner = std::string()) + : Banner(Banner) {} + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; + +} // namespace llvm + +#endif // LLVM_CODEGEN_MACHINEVERIFIER_H diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h new file mode 100644 index 00000000000000..ce63dcc405fd5e --- /dev/null +++ b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h @@ -0,0 +1,96 @@ +//===-- CodeGen/RuntimeLibcallUtil.h - Runtime Library Calls ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines some helper functions for runtime library calls. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_RUNTIMELIBCALLS_H +#define LLVM_CODEGEN_RUNTIMELIBCALLS_H + +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/RuntimeLibcalls.h" +#include "llvm/Support/AtomicOrdering.h" + +namespace llvm { +namespace RTLIB { + +/// GetFPLibCall - Helper to return the right libcall for the given floating +/// point type, or UNKNOWN_LIBCALL if there is none. +Libcall getFPLibCall(EVT VT, Libcall Call_F32, Libcall Call_F64, + Libcall Call_F80, Libcall Call_F128, Libcall Call_PPCF128); + +/// getFPEXT - Return the FPEXT_*_* value for the given types, or +/// UNKNOWN_LIBCALL if there is none. +Libcall getFPEXT(EVT OpVT, EVT RetVT); + +/// getFPROUND - Return the FPROUND_*_* value for the given types, or +/// UNKNOWN_LIBCALL if there is none. +Libcall getFPROUND(EVT OpVT, EVT RetVT); + +/// getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or +/// UNKNOWN_LIBCALL if there is none. +Libcall getFPTOSINT(EVT OpVT, EVT RetVT); + +/// getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or +/// UNKNOWN_LIBCALL if there is none. +Libcall getFPTOUINT(EVT OpVT, EVT RetVT); + +/// getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or +/// UNKNOWN_LIBCALL if there is none. +Libcall getSINTTOFP(EVT OpVT, EVT RetVT); + +/// getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or +/// UNKNOWN_LIBCALL if there is none. +Libcall getUINTTOFP(EVT OpVT, EVT RetVT); + +/// getPOWI - Return the POWI_* value for the given types, or +/// UNKNOWN_LIBCALL if there is none. +Libcall getPOWI(EVT RetVT); + +/// getLDEXP - Return the LDEXP_* value for the given types, or +/// UNKNOWN_LIBCALL if there is none. +Libcall getLDEXP(EVT RetVT); + +/// getFREXP - Return the FREXP_* value for the given types, or +/// UNKNOWN_LIBCALL if there is none. +Libcall getFREXP(EVT RetVT); + +/// Return the SYNC_FETCH_AND_* value for the given opcode and type, or +/// UNKNOWN_LIBCALL if there is none. +Libcall getSYNC(unsigned Opc, MVT VT); + +/// Return the outline atomics value for the given atomic ordering, access +/// size and set of libcalls for a given atomic, or UNKNOWN_LIBCALL if there +/// is none. +Libcall getOutlineAtomicHelper(const Libcall (&LC)[5][4], AtomicOrdering Order, + uint64_t MemSize); + +/// Return the outline atomics value for the given opcode, atomic ordering +/// and type, or UNKNOWN_LIBCALL if there is none. +Libcall getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order, MVT VT); + +/// getMEMCPY_ELEMENT_UNORDERED_ATOMIC - Return +/// MEMCPY_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or +/// UNKNOW_LIBCALL if there is none. +Libcall getMEMCPY_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize); + +/// getMEMMOVE_ELEMENT_UNORDERED_ATOMIC - Return +/// MEMMOVE_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or +/// UNKNOW_LIBCALL if there is none. +Libcall getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize); + +/// getMEMSET_ELEMENT_UNORDERED_ATOMIC - Return +/// MEMSET_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or +/// UNKNOW_LIBCALL if there is none. +Libcall getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize); + +} // namespace RTLIB +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcalls.h b/llvm/include/llvm/CodeGen/RuntimeLibcalls.h deleted file mode 100644 index 3a407c4a4d9406..00000000000000 --- a/llvm/include/llvm/CodeGen/RuntimeLibcalls.h +++ /dev/null @@ -1,113 +0,0 @@ -//===-- CodeGen/RuntimeLibcalls.h - Runtime Library Calls -------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines the enum representing the list of runtime library calls -// the backend may emit during code generation, and also some helper functions. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CODEGEN_RUNTIMELIBCALLS_H -#define LLVM_CODEGEN_RUNTIMELIBCALLS_H - -#include "llvm/CodeGen/ValueTypes.h" -#include "llvm/Support/AtomicOrdering.h" - -namespace llvm { -namespace RTLIB { - /// RTLIB::Libcall enum - This enum defines all of the runtime library calls - /// the backend can emit. The various long double types cannot be merged, - /// because 80-bit library functions use "xf" and 128-bit use "tf". - /// - /// When adding PPCF128 functions here, note that their names generally need - /// to be overridden for Darwin with the xxx$LDBL128 form. See - /// PPCISelLowering.cpp. - /// - enum Libcall { -#define HANDLE_LIBCALL(code, name) code, - #include "llvm/IR/RuntimeLibcalls.def" -#undef HANDLE_LIBCALL - }; - - /// GetFPLibCall - Helper to return the right libcall for the given floating - /// point type, or UNKNOWN_LIBCALL if there is none. - Libcall getFPLibCall(EVT VT, - Libcall Call_F32, - Libcall Call_F64, - Libcall Call_F80, - Libcall Call_F128, - Libcall Call_PPCF128); - - /// getFPEXT - Return the FPEXT_*_* value for the given types, or - /// UNKNOWN_LIBCALL if there is none. - Libcall getFPEXT(EVT OpVT, EVT RetVT); - - /// getFPROUND - Return the FPROUND_*_* value for the given types, or - /// UNKNOWN_LIBCALL if there is none. - Libcall getFPROUND(EVT OpVT, EVT RetVT); - - /// getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or - /// UNKNOWN_LIBCALL if there is none. - Libcall getFPTOSINT(EVT OpVT, EVT RetVT); - - /// getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or - /// UNKNOWN_LIBCALL if there is none. - Libcall getFPTOUINT(EVT OpVT, EVT RetVT); - - /// getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or - /// UNKNOWN_LIBCALL if there is none. - Libcall getSINTTOFP(EVT OpVT, EVT RetVT); - - /// getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or - /// UNKNOWN_LIBCALL if there is none. - Libcall getUINTTOFP(EVT OpVT, EVT RetVT); - - /// getPOWI - Return the POWI_* value for the given types, or - /// UNKNOWN_LIBCALL if there is none. - Libcall getPOWI(EVT RetVT); - - /// getLDEXP - Return the LDEXP_* value for the given types, or - /// UNKNOWN_LIBCALL if there is none. - Libcall getLDEXP(EVT RetVT); - - /// getFREXP - Return the FREXP_* value for the given types, or - /// UNKNOWN_LIBCALL if there is none. - Libcall getFREXP(EVT RetVT); - - /// Return the SYNC_FETCH_AND_* value for the given opcode and type, or - /// UNKNOWN_LIBCALL if there is none. - Libcall getSYNC(unsigned Opc, MVT VT); - - /// Return the outline atomics value for the given atomic ordering, access - /// size and set of libcalls for a given atomic, or UNKNOWN_LIBCALL if there - /// is none. - Libcall getOutlineAtomicHelper(const Libcall (&LC)[5][4], - AtomicOrdering Order, uint64_t MemSize); - - /// Return the outline atomics value for the given opcode, atomic ordering - /// and type, or UNKNOWN_LIBCALL if there is none. - Libcall getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order, MVT VT); - - /// getMEMCPY_ELEMENT_UNORDERED_ATOMIC - Return - /// MEMCPY_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or - /// UNKNOW_LIBCALL if there is none. - Libcall getMEMCPY_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize); - - /// getMEMMOVE_ELEMENT_UNORDERED_ATOMIC - Return - /// MEMMOVE_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or - /// UNKNOW_LIBCALL if there is none. - Libcall getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize); - - /// getMEMSET_ELEMENT_UNORDERED_ATOMIC - Return - /// MEMSET_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or - /// UNKNOW_LIBCALL if there is none. - Libcall getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize); - -} -} - -#endif diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index f39fbd95b3beb7..a905c85f56b668 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -447,6 +447,49 @@ template <> struct EffectiveOperands { explicit EffectiveOperands(SDValue N) : Size(N->getNumOperands()) {} }; +// === Ternary operations === +template +struct TernaryOpc_match { + unsigned Opcode; + T0_P Op0; + T1_P Op1; + T2_P Op2; + + TernaryOpc_match(unsigned Opc, const T0_P &Op0, const T1_P &Op1, + const T2_P &Op2) + : Opcode(Opc), Op0(Op0), Op1(Op1), Op2(Op2) {} + + template + bool match(const MatchContext &Ctx, SDValue N) { + if (sd_context_match(N, Ctx, m_Opc(Opcode))) { + EffectiveOperands EO(N); + assert(EO.Size == 3); + return ((Op0.match(Ctx, N->getOperand(EO.FirstIndex)) && + Op1.match(Ctx, N->getOperand(EO.FirstIndex + 1))) || + (Commutable && Op0.match(Ctx, N->getOperand(EO.FirstIndex + 1)) && + Op1.match(Ctx, N->getOperand(EO.FirstIndex)))) && + Op2.match(Ctx, N->getOperand(EO.FirstIndex + 2)); + } + + return false; + } +}; + +template +inline TernaryOpc_match +m_SetCC(const T0_P &LHS, const T1_P &RHS, const T2_P &CC) { + return TernaryOpc_match(ISD::SETCC, LHS, RHS, + CC); +} + +template +inline TernaryOpc_match +m_c_SetCC(const T0_P &LHS, const T1_P &RHS, const T2_P &CC) { + return TernaryOpc_match(ISD::SETCC, LHS, RHS, + CC); +} + // === Binary operations === template diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 55b60b01e58277..ef66b82d6f4148 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -31,7 +31,7 @@ #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/LowLevelTypeUtils.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" @@ -45,6 +45,7 @@ #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/RuntimeLibcalls.h" #include "llvm/IR/Type.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/AtomicOrdering.h" @@ -3390,6 +3391,10 @@ class TargetLoweringBase { return isOperationLegalOrCustom(Op, VT); } + /// Should we expand [US]CMP nodes using two selects and two compares, or by + /// doing arithmetic on boolean types + virtual bool shouldExpandCmpUsingSelects() const { return false; } + /// Does this target support complex deinterleaving virtual bool isComplexDeinterleavingSupported() const { return false; } @@ -3410,44 +3415,40 @@ class TargetLoweringBase { return nullptr; } - //===--------------------------------------------------------------------===// - // Runtime Library hooks - // - /// Rename the default libcall routine name for the specified libcall. void setLibcallName(RTLIB::Libcall Call, const char *Name) { - LibcallRoutineNames[Call] = Name; + Libcalls.setLibcallName(Call, Name); } + void setLibcallName(ArrayRef Calls, const char *Name) { - for (auto Call : Calls) - setLibcallName(Call, Name); + Libcalls.setLibcallName(Calls, Name); } /// Get the libcall routine name for the specified libcall. const char *getLibcallName(RTLIB::Libcall Call) const { - return LibcallRoutineNames[Call]; + return Libcalls.getLibcallName(Call); } /// Override the default CondCode to be used to test the result of the /// comparison libcall against zero. void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC) { - CmpLibcallCCs[Call] = CC; + Libcalls.setCmpLibcallCC(Call, CC); } /// Get the CondCode that's to be used to test the result of the comparison /// libcall against zero. ISD::CondCode getCmpLibcallCC(RTLIB::Libcall Call) const { - return CmpLibcallCCs[Call]; + return Libcalls.getCmpLibcallCC(Call); } /// Set the CallingConv that should be used for the specified libcall. void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) { - LibcallCallingConvs[Call] = CC; + Libcalls.setLibcallCallingConv(Call, CC); } /// Get the CallingConv that should be used for the specified libcall. CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const { - return LibcallCallingConvs[Call]; + return Libcalls.getLibcallCallingConv(Call); } /// Execute target specific actions to finalize target lowering. @@ -3626,18 +3627,8 @@ class TargetLoweringBase { std::map, MVT::SimpleValueType> PromoteToType; - /// Stores the name each libcall. - const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1]; - - /// The ISD::CondCode that should be used to test the result of each of the - /// comparison libcall against zero. - ISD::CondCode CmpLibcallCCs[RTLIB::UNKNOWN_LIBCALL]; - - /// Stores the CallingConv that should be used for each libcall. - CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL]; - - /// Set default libcall names and calling conventions. - void InitLibcalls(const Triple &TT); + /// The list of libcalls that the target will use. + RTLIB::RuntimeLibcallsInfo Libcalls; /// The bits of IndexedModeActions used to store the legalisation actions /// We store the data as | ML | MS | L | S | each taking 4 bits. diff --git a/llvm/include/llvm/CodeGen/TwoAddressInstructionPass.h b/llvm/include/llvm/CodeGen/TwoAddressInstructionPass.h new file mode 100644 index 00000000000000..7f2a070c584347 --- /dev/null +++ b/llvm/include/llvm/CodeGen/TwoAddressInstructionPass.h @@ -0,0 +1,29 @@ +//===- llvm/CodeGen/TwoAddressInstructionPass.h -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_TWOADDRESSINSTRUCTIONPASS_H +#define LLVM_CODEGEN_TWOADDRESSINSTRUCTIONPASS_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { + +class TwoAddressInstructionPass + : public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); + MachineFunctionProperties getSetProperties() { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::TiedOpsRewritten); + } +}; + +} // namespace llvm + +#endif // LLVM_CODEGEN_TWOADDRESSINSTRUCTIONPASS_H diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h index c7c558850a2805..a9a3c7edde691e 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h @@ -497,7 +497,7 @@ class CFIProgram { /// Types of operands to CFI instructions /// In DWARF, this type is implicitly tied to a CFI instruction opcode and - /// thus this type doesn't need to be explictly written to the file (this is + /// thus this type doesn't need to be explicitly written to the file (this is /// not a DWARF encoding). The relationship of instrs to operand types can /// be obtained from getOperandTypes() and is only used to simplify /// instruction printing. diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h index dbb658940eef12..563887d1149a8c 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h @@ -107,12 +107,10 @@ class DWARFFormValue { /// getAsFoo functions below return the extracted value as Foo if only /// DWARFFormValue has form class is suitable for representing Foo. - std::optional getAsReference() const; - struct UnitOffset { - DWARFUnit *Unit; - uint64_t Offset; - }; - std::optional getAsRelativeReference() const; + std::optional getAsRelativeReference() const; + std::optional getAsDebugInfoReference() const; + std::optional getAsSignatureReference() const; + std::optional getAsSupplementaryReference() const; std::optional getAsUnsignedConstant() const; std::optional getAsSignedConstant() const; Expected getAsCString() const; @@ -242,27 +240,102 @@ inline uint64_t toUnsigned(const std::optional &V, return toUnsigned(V).value_or(Default); } -/// Take an optional DWARFFormValue and try to extract an reference. +/// Take an optional DWARFFormValue and try to extract a relative offset +/// reference. /// -/// \param V and optional DWARFFormValue to attempt to extract the value from. +/// \param V an optional DWARFFormValue to attempt to extract the value from. /// \returns an optional value that contains a value if the form value -/// was valid and has a reference form. +/// was valid and has a relative reference form. inline std::optional -toReference(const std::optional &V) { +toRelativeReference(const std::optional &V) { if (V) - return V->getAsReference(); + return V->getAsRelativeReference(); return std::nullopt; } -/// Take an optional DWARFFormValue and extract a reference. +/// Take an optional DWARFFormValue and extract a relative offset reference. /// -/// \param V and optional DWARFFormValue to attempt to extract the value from. +/// \param V an optional DWARFFormValue to attempt to extract the value from. +/// \param Default the default value to return in case of failure. +/// \returns the extracted reference value or Default if the V doesn't have a +/// value or the form value's encoding wasn't a relative offset reference form. +inline uint64_t toRelativeReference(const std::optional &V, + uint64_t Default) { + return toRelativeReference(V).value_or(Default); +} + +/// Take an optional DWARFFormValue and try to extract an absolute debug info +/// offset reference. +/// +/// \param V an optional DWARFFormValue to attempt to extract the value from. +/// \returns an optional value that contains a value if the form value +/// was valid and has an (absolute) debug info offset reference form. +inline std::optional +toDebugInfoReference(const std::optional &V) { + if (V) + return V->getAsDebugInfoReference(); + return std::nullopt; +} + +/// Take an optional DWARFFormValue and extract an absolute debug info offset +/// reference. +/// +/// \param V an optional DWARFFormValue to attempt to extract the value from. +/// \param Default the default value to return in case of failure. +/// \returns the extracted reference value or Default if the V doesn't have a +/// value or the form value's encoding wasn't an absolute debug info offset +/// reference form. +inline uint64_t toDebugInfoReference(const std::optional &V, + uint64_t Default) { + return toDebugInfoReference(V).value_or(Default); +} + +/// Take an optional DWARFFormValue and try to extract a signature reference. +/// +/// \param V an optional DWARFFormValue to attempt to extract the value from. +/// \returns an optional value that contains a value if the form value +/// was valid and has a signature reference form. +inline std::optional +toSignatureReference(const std::optional &V) { + if (V) + return V->getAsSignatureReference(); + return std::nullopt; +} + +/// Take an optional DWARFFormValue and extract a signature reference. +/// +/// \param V an optional DWARFFormValue to attempt to extract the value from. +/// \param Default the default value to return in case of failure. +/// \returns the extracted reference value or Default if the V doesn't have a +/// value or the form value's encoding wasn't a signature reference form. +inline uint64_t toSignatureReference(const std::optional &V, + uint64_t Default) { + return toSignatureReference(V).value_or(Default); +} + +/// Take an optional DWARFFormValue and try to extract a supplementary debug +/// info reference. +/// +/// \param V an optional DWARFFormValue to attempt to extract the value from. +/// \returns an optional value that contains a value if the form value +/// was valid and has a supplementary reference form. +inline std::optional +toSupplementaryReference(const std::optional &V) { + if (V) + return V->getAsSupplementaryReference(); + return std::nullopt; +} + +/// Take an optional DWARFFormValue and extract a supplementary debug info +/// reference. +/// +/// \param V an optional DWARFFormValue to attempt to extract the value from. /// \param Default the default value to return in case of failure. /// \returns the extracted reference value or Default if the V doesn't have a -/// value or the form value's encoding wasn't a reference form. -inline uint64_t toReference(const std::optional &V, - uint64_t Default) { - return toReference(V).value_or(Default); +/// value or the form value's encoding wasn't a supplementary reference form. +inline uint64_t toSupplementaryReference(const std::optional &V, + uint64_t Default) { + return toSupplementaryReference(V).value_or(Default); } /// Take an optional DWARFFormValue and try to extract an signed constant. diff --git a/llvm/include/llvm/Demangle/MicrosoftDemangle.h b/llvm/include/llvm/Demangle/MicrosoftDemangle.h index 1529b803debe5a..6891185a28e57f 100644 --- a/llvm/include/llvm/Demangle/MicrosoftDemangle.h +++ b/llvm/include/llvm/Demangle/MicrosoftDemangle.h @@ -54,6 +54,10 @@ class ArenaAllocator { } } + // Delete the copy constructor and the copy assignment operator. + ArenaAllocator(const ArenaAllocator &) = delete; + ArenaAllocator &operator=(const ArenaAllocator &) = delete; + char *allocUnalignedBuffer(size_t Size) { assert(Head && Head->Buf); diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h index 7a9b95f23465cd..c27572300d5063 100644 --- a/llvm/include/llvm/IR/Instruction.h +++ b/llvm/include/llvm/IR/Instruction.h @@ -433,17 +433,7 @@ class Instruction : public User, /// convenience method for passes to do so. /// dropUBImplyingAttrsAndUnknownMetadata should be used instead of /// this API if the Instruction being modified is a call. - void dropUnknownNonDebugMetadata(ArrayRef KnownIDs); - void dropUnknownNonDebugMetadata() { - return dropUnknownNonDebugMetadata(std::nullopt); - } - void dropUnknownNonDebugMetadata(unsigned ID1) { - return dropUnknownNonDebugMetadata(ArrayRef(ID1)); - } - void dropUnknownNonDebugMetadata(unsigned ID1, unsigned ID2) { - unsigned IDs[] = {ID1, ID2}; - return dropUnknownNonDebugMetadata(IDs); - } + void dropUnknownNonDebugMetadata(ArrayRef KnownIDs = std::nullopt); /// @} /// Adds an !annotation metadata node with \p Annotation to this instruction. diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index a2ecf625ff61aa..fe3f92da400f8a 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -1296,9 +1296,6 @@ class MemMoveInst : public MemTransferInst { /// This class wraps the llvm.memcpy.inline intrinsic. class MemCpyInlineInst : public MemCpyInst { public: - ConstantInt *getLength() const { - return cast(MemCpyInst::getLength()); - } // Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::memcpy_inline; diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 01e379dfcebcad..9d04256d593178 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -966,7 +966,6 @@ def int_memcpy : Intrinsic<[], // Memcpy semantic that is guaranteed to be inlined. // In particular this means that the generated code is not allowed to call any // external function. -// The third argument (specifying the size) must be a constant. def int_memcpy_inline : Intrinsic<[], [llvm_anyptr_ty, llvm_anyptr_ty, llvm_anyint_ty, llvm_i1_ty], @@ -974,7 +973,7 @@ def int_memcpy_inline NoCapture>, NoCapture>, NoAlias>, NoAlias>, WriteOnly>, ReadOnly>, - ImmArg>, ImmArg>]>; + ImmArg>]>; def int_memmove : Intrinsic<[], [llvm_anyptr_ty, llvm_anyptr_ty, llvm_anyint_ty, diff --git a/llvm/include/llvm/IR/IntrinsicsRISCVXCV.td b/llvm/include/llvm/IR/IntrinsicsRISCVXCV.td index 8b4f4966fbd9aa..38263f375c4692 100644 --- a/llvm/include/llvm/IR/IntrinsicsRISCVXCV.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCVXCV.td @@ -30,6 +30,18 @@ class ScalarCoreVAluGprGprGprIntrinsic : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; +class ScalarCoreVMacGprGprGprIntrinsic + : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrWillReturn, IntrSpeculatable]>; + +class ScalarCoreVMacGprGPRImmIntrinsic + : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrWillReturn, IntrSpeculatable, ImmArg>]>; + +class ScalarCoreVMacGprGprGprImmIntrinsic + : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrWillReturn, IntrSpeculatable, ImmArg>]>; + let TargetPrefix = "riscv" in { def int_riscv_cv_bitmanip_extract : ScalarCoreVBitManipGprGprIntrinsic; def int_riscv_cv_bitmanip_extractu : ScalarCoreVBitManipGprGprIntrinsic; @@ -57,4 +69,25 @@ let TargetPrefix = "riscv" in { def int_riscv_cv_alu_subun : ScalarCoreVAluGprGprGprIntrinsic; def int_riscv_cv_alu_subrn : ScalarCoreVAluGprGprGprIntrinsic; def int_riscv_cv_alu_suburn : ScalarCoreVAluGprGprGprIntrinsic; + + def int_riscv_cv_mac_mac : ScalarCoreVMacGprGprGprIntrinsic; + def int_riscv_cv_mac_msu : ScalarCoreVMacGprGprGprIntrinsic; + + def int_riscv_cv_mac_muluN : ScalarCoreVMacGprGPRImmIntrinsic; + def int_riscv_cv_mac_mulhhuN : ScalarCoreVMacGprGPRImmIntrinsic; + def int_riscv_cv_mac_mulsN : ScalarCoreVMacGprGPRImmIntrinsic; + def int_riscv_cv_mac_mulhhsN : ScalarCoreVMacGprGPRImmIntrinsic; + def int_riscv_cv_mac_muluRN : ScalarCoreVMacGprGPRImmIntrinsic; + def int_riscv_cv_mac_mulhhuRN : ScalarCoreVMacGprGPRImmIntrinsic; + def int_riscv_cv_mac_mulsRN : ScalarCoreVMacGprGPRImmIntrinsic; + def int_riscv_cv_mac_mulhhsRN : ScalarCoreVMacGprGPRImmIntrinsic; + + def int_riscv_cv_mac_macuN : ScalarCoreVMacGprGprGprImmIntrinsic; + def int_riscv_cv_mac_machhuN : ScalarCoreVMacGprGprGprImmIntrinsic; + def int_riscv_cv_mac_macsN : ScalarCoreVMacGprGprGprImmIntrinsic; + def int_riscv_cv_mac_machhsN : ScalarCoreVMacGprGprGprImmIntrinsic; + def int_riscv_cv_mac_macuRN : ScalarCoreVMacGprGprGprImmIntrinsic; + def int_riscv_cv_mac_machhuRN : ScalarCoreVMacGprGprGprImmIntrinsic; + def int_riscv_cv_mac_macsRN : ScalarCoreVMacGprGprGprImmIntrinsic; + def int_riscv_cv_mac_machhsRN : ScalarCoreVMacGprGprGprImmIntrinsic; } // TargetPrefix = "riscv" diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index aee804047e1b06..adc46f9789ebb6 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -129,83 +129,6 @@ let TargetPrefix = "x86" in { Intrinsic<[], [llvm_ptr_ty], []>; } -//===----------------------------------------------------------------------===// -// 3DNow! - -let TargetPrefix = "x86" in { - def int_x86_3dnow_pavgusb : ClangBuiltin<"__builtin_ia32_pavgusb">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], - [IntrNoMem]>; - def int_x86_3dnow_pf2id : ClangBuiltin<"__builtin_ia32_pf2id">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>; - def int_x86_3dnow_pfacc : ClangBuiltin<"__builtin_ia32_pfacc">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], - [IntrNoMem]>; - def int_x86_3dnow_pfadd : ClangBuiltin<"__builtin_ia32_pfadd">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], - [IntrNoMem]>; - def int_x86_3dnow_pfcmpeq : ClangBuiltin<"__builtin_ia32_pfcmpeq">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], - [IntrNoMem]>; - def int_x86_3dnow_pfcmpge : ClangBuiltin<"__builtin_ia32_pfcmpge">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], - [IntrNoMem]>; - def int_x86_3dnow_pfcmpgt : ClangBuiltin<"__builtin_ia32_pfcmpgt">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], - [IntrNoMem]>; - def int_x86_3dnow_pfmax : ClangBuiltin<"__builtin_ia32_pfmax">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], - [IntrNoMem]>; - def int_x86_3dnow_pfmin : ClangBuiltin<"__builtin_ia32_pfmin">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], - [IntrNoMem]>; - def int_x86_3dnow_pfmul : ClangBuiltin<"__builtin_ia32_pfmul">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], - [IntrNoMem]>; - def int_x86_3dnow_pfrcp : ClangBuiltin<"__builtin_ia32_pfrcp">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>; - def int_x86_3dnow_pfrcpit1 : ClangBuiltin<"__builtin_ia32_pfrcpit1">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], - [IntrNoMem]>; - def int_x86_3dnow_pfrcpit2 : ClangBuiltin<"__builtin_ia32_pfrcpit2">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], - [IntrNoMem]>; - def int_x86_3dnow_pfrsqrt : ClangBuiltin<"__builtin_ia32_pfrsqrt">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>; - def int_x86_3dnow_pfrsqit1 : ClangBuiltin<"__builtin_ia32_pfrsqit1">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], - [IntrNoMem]>; - def int_x86_3dnow_pfsub : ClangBuiltin<"__builtin_ia32_pfsub">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], - [IntrNoMem]>; - def int_x86_3dnow_pfsubr : ClangBuiltin<"__builtin_ia32_pfsubr">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], - [IntrNoMem]>; - def int_x86_3dnow_pi2fd : ClangBuiltin<"__builtin_ia32_pi2fd">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>; - def int_x86_3dnow_pmulhrw : ClangBuiltin<"__builtin_ia32_pmulhrw">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], - [IntrNoMem]>; -} - -//===----------------------------------------------------------------------===// -// 3DNow! extensions - -let TargetPrefix = "x86" in { - def int_x86_3dnowa_pf2iw : ClangBuiltin<"__builtin_ia32_pf2iw">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>; - def int_x86_3dnowa_pfnacc : ClangBuiltin<"__builtin_ia32_pfnacc">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], - [IntrNoMem]>; - def int_x86_3dnowa_pfpnacc : ClangBuiltin<"__builtin_ia32_pfpnacc">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], - [IntrNoMem]>; - def int_x86_3dnowa_pi2fw : ClangBuiltin<"__builtin_ia32_pi2fw">, - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>; - def int_x86_3dnowa_pswapd : - DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>; -} - //===----------------------------------------------------------------------===// // SSE1 @@ -2332,8 +2255,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_mmx_emms : ClangBuiltin<"__builtin_ia32_emms">, Intrinsic<[], [], []>; - def int_x86_mmx_femms : ClangBuiltin<"__builtin_ia32_femms">, - Intrinsic<[], [], []>; } // Integer arithmetic ops. diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h new file mode 100644 index 00000000000000..3057bff397b2fb --- /dev/null +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -0,0 +1,126 @@ +//===- RuntimeLibcalls.h - Interface for runtime libcalls -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a common interface to work with library calls into a +// runtime that may be emitted by a given backend. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_RUNTIME_LIBCALLS_H +#define LLVM_IR_RUNTIME_LIBCALLS_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/TargetParser/Triple.h" + +namespace llvm { +namespace RTLIB { + +/// RTLIB::Libcall enum - This enum defines all of the runtime library calls +/// the backend can emit. The various long double types cannot be merged, +/// because 80-bit library functions use "xf" and 128-bit use "tf". +/// +/// When adding PPCF128 functions here, note that their names generally need +/// to be overridden for Darwin with the xxx$LDBL128 form. See +/// PPCISelLowering.cpp. +/// +enum Libcall { +#define HANDLE_LIBCALL(code, name) code, +#include "llvm/IR/RuntimeLibcalls.def" +#undef HANDLE_LIBCALL +}; + +/// A simple container for information about the supported runtime calls. +struct RuntimeLibcallsInfo { + explicit RuntimeLibcallsInfo(const Triple &TT) { + initLibcalls(TT); + initCmpLibcallCCs(); + } + + /// Rename the default libcall routine name for the specified libcall. + void setLibcallName(RTLIB::Libcall Call, const char *Name) { + LibcallRoutineNames[Call] = Name; + } + + void setLibcallName(ArrayRef Calls, const char *Name) { + for (auto Call : Calls) + setLibcallName(Call, Name); + } + + /// Get the libcall routine name for the specified libcall. + const char *getLibcallName(RTLIB::Libcall Call) const { + return LibcallRoutineNames[Call]; + } + + /// Override the default CondCode to be used to test the result of the + /// comparison libcall against zero. + void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC) { + CmpLibcallCCs[Call] = CC; + } + + /// Get the CondCode that's to be used to test the result of the comparison + /// libcall against zero. + ISD::CondCode getCmpLibcallCC(RTLIB::Libcall Call) const { + return CmpLibcallCCs[Call]; + } + + /// Set the CallingConv that should be used for the specified libcall. + void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) { + LibcallCallingConvs[Call] = CC; + } + + /// Get the CallingConv that should be used for the specified libcall. + CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const { + return LibcallCallingConvs[Call]; + } + + iterator_range getLibcallNames() { + return llvm::make_range(LibcallRoutineNames, + LibcallRoutineNames + RTLIB::UNKNOWN_LIBCALL); + } + +private: + /// Stores the name each libcall. + const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1]; + + /// The ISD::CondCode that should be used to test the result of each of the + /// comparison libcall against zero. + ISD::CondCode CmpLibcallCCs[RTLIB::UNKNOWN_LIBCALL]; + + /// Stores the CallingConv that should be used for each libcall. + CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL]; + + static bool darwinHasSinCos(const Triple &TT) { + assert(TT.isOSDarwin() && "should be called with darwin triple"); + // Don't bother with 32 bit x86. + if (TT.getArch() == Triple::x86) + return false; + // Macos < 10.9 has no sincos_stret. + if (TT.isMacOSX()) + return !TT.isMacOSXVersionLT(10, 9) && TT.isArch64Bit(); + // iOS < 7.0 has no sincos_stret. + if (TT.isiOS()) + return !TT.isOSVersionLT(7, 0); + // Any other darwin such as WatchOS/TvOS is new enough. + return true; + } + + /// Sets default libcall calling conventions. + void initCmpLibcallCCs(); + + /// Set default libcall names. If a target wants to opt-out of a libcall it + /// should be placed here. + void initLibcalls(const Triple &TT); +}; + +} // namespace RTLIB +} // namespace llvm + +#endif // LLVM_IR_RUNTIME_LIBCALLS_H diff --git a/llvm/include/llvm/IR/VectorBuilder.h b/llvm/include/llvm/IR/VectorBuilder.h index 301edaed70fe88..6af7f6075551dc 100644 --- a/llvm/include/llvm/IR/VectorBuilder.h +++ b/llvm/include/llvm/IR/VectorBuilder.h @@ -15,6 +15,7 @@ #ifndef LLVM_IR_VECTORBUILDER_H #define LLVM_IR_VECTORBUILDER_H +#include #include #include #include @@ -57,6 +58,11 @@ class VectorBuilder { return RetType(); } + /// Helper function for creating VP intrinsic call. + Value *createVectorInstructionImpl(Intrinsic::ID VPID, Type *ReturnTy, + ArrayRef VecOpArray, + const Twine &Name = Twine()); + public: VectorBuilder(IRBuilderBase &Builder, Behavior ErrorHandling = Behavior::ReportAndAbort) @@ -92,6 +98,15 @@ class VectorBuilder { Value *createVectorInstruction(unsigned Opcode, Type *ReturnTy, ArrayRef VecOpArray, const Twine &Name = Twine()); + + /// Emit a VP reduction intrinsic call for recurrence kind. + /// \param Kind The kind of recurrence + /// \param ValTy The type of operand which the reduction operation is + /// performed. + /// \param VecOpArray The operand list. + Value *createSimpleTargetReduction(RecurKind Kind, Type *ValTy, + ArrayRef VecOpArray, + const Twine &Name = Twine()); }; } // namespace llvm diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 8979fcd95aa9a6..13be9c11f01072 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -208,7 +208,7 @@ void initializeMachineSinkingPass(PassRegistry&); void initializeMachineTraceMetricsPass(PassRegistry&); void initializeMachineUniformityInfoPrinterPassPass(PassRegistry &); void initializeMachineUniformityAnalysisPassPass(PassRegistry &); -void initializeMachineVerifierPassPass(PassRegistry&); +void initializeMachineVerifierLegacyPassPass(PassRegistry &); void initializeMemoryDependenceWrapperPassPass(PassRegistry&); void initializeMemorySSAWrapperPassPass(PassRegistry&); void initializeMergeICmpsLegacyPassPass(PassRegistry &); @@ -298,7 +298,7 @@ void initializeTargetLibraryInfoWrapperPassPass(PassRegistry&); void initializeTargetPassConfigPass(PassRegistry&); void initializeTargetTransformInfoWrapperPassPass(PassRegistry&); void initializeTLSVariableHoistLegacyPassPass(PassRegistry &); -void initializeTwoAddressInstructionPassPass(PassRegistry&); +void initializeTwoAddressInstructionLegacyPassPass(PassRegistry &); void initializeTypeBasedAAWrapperPassPass(PassRegistry&); void initializeTypePromotionLegacyPass(PassRegistry&); void initializeInitUndefPass(PassRegistry &); diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index 94996ae89e35d0..30eda34cd7ba54 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -301,7 +301,7 @@ class LTO { /// Static method that returns a list of libcall symbols that can be generated /// by LTO but might not be visible from bitcode symbol table. - static ArrayRef getRuntimeLibcallSymbols(); + static SmallVector getRuntimeLibcallSymbols(const Triple &TT); private: Config Conf; diff --git a/llvm/include/llvm/MC/MCTargetOptions.h b/llvm/include/llvm/MC/MCTargetOptions.h index 98317712250bf9..899299fd15246a 100644 --- a/llvm/include/llvm/MC/MCTargetOptions.h +++ b/llvm/include/llvm/MC/MCTargetOptions.h @@ -68,6 +68,8 @@ class MCTargetOptions { // ELF. bool X86RelaxRelocations = true; + bool X86Sse2Avx = false; + EmitDwarfUnwindType EmitDwarfUnwind; int DwarfVersion = 0; diff --git a/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h b/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h index c419f2998a3834..9d592446f3ba77 100644 --- a/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h +++ b/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h @@ -55,6 +55,8 @@ bool getCrel(); bool getX86RelaxRelocations(); +bool getX86Sse2Avx(); + std::string getABIName(); std::string getAsSecureLogFile(); diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 81900510c9ae79..5b8e69b602e2b1 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -52,6 +52,7 @@ #include "llvm/CodeGen/SjLjEHPrepare.h" #include "llvm/CodeGen/StackProtector.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TwoAddressInstructionPass.h" #include "llvm/CodeGen/UnreachableBlockElim.h" #include "llvm/CodeGen/WasmEHPrepare.h" #include "llvm/CodeGen/WinEHPrepare.h" diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index d5cd8d4a132fca..a47d7494f2eefc 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -101,6 +101,8 @@ MACHINE_FUNCTION_ANALYSIS("machine-branch-prob", MachineBranchProbabilityAnalysis()) MACHINE_FUNCTION_ANALYSIS("machine-dom-tree", MachineDominatorTreeAnalysis()) MACHINE_FUNCTION_ANALYSIS("machine-loops", MachineLoopAnalysis()) +MACHINE_FUNCTION_ANALYSIS("machine-opt-remark-emitter", + MachineOptimizationRemarkEmitterAnalysis()) MACHINE_FUNCTION_ANALYSIS("machine-post-dom-tree", MachinePostDominatorTreeAnalysis()) MACHINE_FUNCTION_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC)) @@ -112,8 +114,6 @@ MACHINE_FUNCTION_ANALYSIS("slot-indexes", SlotIndexesAnalysis()) // MACHINE_FUNCTION_ANALYSIS("machine-loops", MachineLoopInfoAnalysis()) // MACHINE_FUNCTION_ANALYSIS("machine-dom-frontier", // MachineDominanceFrontierAnalysis()) -// MACHINE_FUNCTION_ANALYSIS("machine-ore", -// MachineOptimizationRemarkEmitterPassAnalysis()) // MACHINE_FUNCTION_ANALYSIS("machine-post-dom-tree", // MachinePostDominatorTreeAnalysis()) // MACHINE_FUNCTION_ANALYSIS("machine-region-info", @@ -148,6 +148,8 @@ MACHINE_FUNCTION_PASS("print", SlotIndexesPrinterPass(dbgs())) MACHINE_FUNCTION_PASS("require-all-machine-function-properties", RequireAllMachineFunctionPropertiesPass()) MACHINE_FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass()) +MACHINE_FUNCTION_PASS("two-address-instruction", TwoAddressInstructionPass()) +MACHINE_FUNCTION_PASS("verify", MachineVerifierPass()) #undef MACHINE_FUNCTION_PASS #ifndef MACHINE_FUNCTION_PASS_WITH_PARAMS @@ -257,7 +259,6 @@ DUMMY_MACHINE_FUNCTION_PASS("stack-frame-layout", StackFrameLayoutAnalysisPass) DUMMY_MACHINE_FUNCTION_PASS("stack-slot-coloring", StackSlotColoringPass) DUMMY_MACHINE_FUNCTION_PASS("stackmap-liveness", StackMapLivenessPass) DUMMY_MACHINE_FUNCTION_PASS("tailduplication", TailDuplicatePass) -DUMMY_MACHINE_FUNCTION_PASS("twoaddressinstruction", TwoAddressInstructionPass) DUMMY_MACHINE_FUNCTION_PASS("unpack-mi-bundles", UnpackMachineBundlesPass) DUMMY_MACHINE_FUNCTION_PASS("virtregrewriter", VirtRegRewriterPass) DUMMY_MACHINE_FUNCTION_PASS("xray-instrumentation", XRayInstrumentationPass) diff --git a/llvm/include/llvm/Passes/StandardInstrumentations.h b/llvm/include/llvm/Passes/StandardInstrumentations.h index 84d1b541171bf1..fa9c744294a666 100644 --- a/llvm/include/llvm/Passes/StandardInstrumentations.h +++ b/llvm/include/llvm/Passes/StandardInstrumentations.h @@ -461,7 +461,8 @@ class VerifyInstrumentation { public: VerifyInstrumentation(bool DebugLogging) : DebugLogging(DebugLogging) {} - void registerCallbacks(PassInstrumentationCallbacks &PIC); + void registerCallbacks(PassInstrumentationCallbacks &PIC, + ModuleAnalysisManager *MAM); }; /// This class implements --time-trace functionality for new pass manager. diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 317884fe076810..fcb581211736ee 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -55,12 +55,13 @@ // } // namespace sandboxir // -#ifndef LLVM_TRANSFORMS_SANDBOXIR_SANDBOXIR_H -#define LLVM_TRANSFORMS_SANDBOXIR_SANDBOXIR_H +#ifndef LLVM_SANDBOXIR_SANDBOXIR_H +#define LLVM_SANDBOXIR_SANDBOXIR_H #include "llvm/IR/Function.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/SandboxIR/Use.h" #include "llvm/Support/raw_ostream.h" #include @@ -68,48 +69,13 @@ namespace llvm { namespace sandboxir { -class Function; +class BasicBlock; class Context; +class Function; class Instruction; class User; class Value; -/// Represents a Def-use/Use-def edge in SandboxIR. -/// NOTE: Unlike llvm::Use, this is not an integral part of the use-def chains. -/// It is also not uniqued and is currently passed by value, so you can have -/// more than one sandboxir::Use objects for the same use-def edge. -class Use { - llvm::Use *LLVMUse; - User *Usr; - Context *Ctx; - - /// Don't allow the user to create a sandboxir::Use directly. - Use(llvm::Use *LLVMUse, User *Usr, Context &Ctx) - : LLVMUse(LLVMUse), Usr(Usr), Ctx(&Ctx) {} - Use() : LLVMUse(nullptr), Ctx(nullptr) {} - - friend class Value; // For constructor - friend class User; // For constructor - friend class OperandUseIterator; // For constructor - friend class UserUseIterator; // For accessing members - -public: - operator Value *() const { return get(); } - Value *get() const; - class User *getUser() const { return Usr; } - unsigned getOperandNo() const; - Context *getContext() const { return Ctx; } - bool operator==(const Use &Other) const { - assert(Ctx == Other.Ctx && "Contexts differ!"); - return LLVMUse == Other.LLVMUse && Usr == Other.Usr; - } - bool operator!=(const Use &Other) const { return !(*this == Other); } -#ifndef NDEBUG - void dump(raw_ostream &OS) const; - void dump() const; -#endif // NDEBUG -}; - /// Returns the operand edge when dereferenced. class OperandUseIterator { sandboxir::Use Use; @@ -508,6 +474,14 @@ class Instruction : public sandboxir::User { Opcode Opc; + /// A SandboxIR Instruction may map to multiple LLVM IR Instruction. This + /// returns its topmost LLVM IR instruction. + llvm::Instruction *getTopmostLLVMInstruction() const; + + /// \Returns the LLVM IR Instructions that this SandboxIR maps to in program + /// order. + virtual SmallVector getLLVMInstrs() const = 0; + public: static const char *getOpcodeName(Opcode Opc); #ifndef NDEBUG @@ -518,6 +492,40 @@ class Instruction : public sandboxir::User { #endif /// This is used by BasicBlock::iterator. virtual unsigned getNumOfIRInstrs() const = 0; + /// \Returns a BasicBlock::iterator for this Instruction. + BBIterator getIterator() const; + /// \Returns the next sandboxir::Instruction in the block, or nullptr if at + /// the end of the block. + Instruction *getNextNode() const; + /// \Returns the previous sandboxir::Instruction in the block, or nullptr if + /// at the beginning of the block. + Instruction *getPrevNode() const; + /// \Returns this Instruction's opcode. Note that SandboxIR has its own opcode + /// state to allow for new SandboxIR-specific instructions. + Opcode getOpcode() const { return Opc; } + /// Detach this from its parent BasicBlock without deleting it. + void removeFromParent(); + /// Detach this Value from its parent and delete it. + void eraseFromParent(); + /// Insert this detached instruction before \p BeforeI. + void insertBefore(Instruction *BeforeI); + /// Insert this detached instruction after \p AfterI. + void insertAfter(Instruction *AfterI); + /// Insert this detached instruction into \p BB at \p WhereIt. + void insertInto(BasicBlock *BB, const BBIterator &WhereIt); + /// Move this instruction to \p WhereIt. + void moveBefore(BasicBlock &BB, const BBIterator &WhereIt); + /// Move this instruction before \p Before. + void moveBefore(Instruction *Before) { + moveBefore(*Before->getParent(), Before->getIterator()); + } + /// Move this instruction after \p After. + void moveAfter(Instruction *After) { + moveBefore(*After->getParent(), std::next(After->getIterator())); + } + /// \Returns the BasicBlock containing this Instruction, or null if it is + /// detached. + BasicBlock *getParent() const; /// For isa/dyn_cast. static bool classof(const sandboxir::Value *From); @@ -543,6 +551,9 @@ class OpaqueInst : public sandboxir::Instruction { Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final { return getOperandUseDefault(OpIdx, Verify); } + SmallVector getLLVMInstrs() const final { + return {cast(Val)}; + } public: static bool classof(const sandboxir::Value *From) { @@ -570,7 +581,8 @@ class BasicBlock : public Value { /// Builds a graph that contains all values in \p BB in their original form /// i.e., no vectorization is taking place here. void buildBasicBlockFromLLVMIR(llvm::BasicBlock *LLVMBB); - friend class Context; // For `buildBasicBlockFromIR` + friend class Context; // For `buildBasicBlockFromIR` + friend class Instruction; // For LLVM Val. BasicBlock(llvm::BasicBlock *BB, Context &SBCtx) : Value(ClassID::Block, BB, SBCtx) { @@ -623,6 +635,12 @@ class Context { DenseMap> LLVMValueToValueMap; + /// Remove \p V from the maps and returns the unique_ptr. + std::unique_ptr detachLLVMValue(llvm::Value *V); + /// Remove \p SBV from all SandboxIR maps and stop owning it. This effectively + /// detaches \p V from the underlying IR. + std::unique_ptr detach(Value *V); + friend void Instruction::eraseFromParent(); // For detach(). /// Take ownership of VPtr and store it in `LLVMValueToValueMap`. Value *registerValue(std::unique_ptr &&VPtr); @@ -711,4 +729,4 @@ class Function : public sandboxir::Value { } // namespace sandboxir } // namespace llvm -#endif // LLVM_TRANSFORMS_SANDBOXIR_SANDBOXIR_H +#endif // LLVM_SANDBOXIR_SANDBOXIR_H diff --git a/llvm/include/llvm/SandboxIR/Use.h b/llvm/include/llvm/SandboxIR/Use.h new file mode 100644 index 00000000000000..33afb54c1ff297 --- /dev/null +++ b/llvm/include/llvm/SandboxIR/Use.h @@ -0,0 +1,63 @@ +//===- Use.h ----------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Sandbox IR Use. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SANDBOXIR_USE_H +#define LLVM_SANDBOXIR_USE_H + +#include "llvm/IR/Use.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm::sandboxir { + +class Context; +class Value; +class User; + +/// Represents a Def-use/Use-def edge in SandboxIR. +/// NOTE: Unlike llvm::Use, this is not an integral part of the use-def chains. +/// It is also not uniqued and is currently passed by value, so you can have +/// more than one sandboxir::Use objects for the same use-def edge. +class Use { + llvm::Use *LLVMUse; + User *Usr; + Context *Ctx; + + /// Don't allow the user to create a sandboxir::Use directly. + Use(llvm::Use *LLVMUse, User *Usr, Context &Ctx) + : LLVMUse(LLVMUse), Usr(Usr), Ctx(&Ctx) {} + Use() : LLVMUse(nullptr), Ctx(nullptr) {} + + friend class Value; // For constructor + friend class User; // For constructor + friend class OperandUseIterator; // For constructor + friend class UserUseIterator; // For accessing members + +public: + operator Value *() const { return get(); } + Value *get() const; + class User *getUser() const { return Usr; } + unsigned getOperandNo() const; + Context *getContext() const { return Ctx; } + bool operator==(const Use &Other) const { + assert(Ctx == Other.Ctx && "Contexts differ!"); + return LLVMUse == Other.LLVMUse && Usr == Other.Usr; + } + bool operator!=(const Use &Other) const { return !(*this == Other); } +#ifndef NDEBUG + void dump(raw_ostream &OS) const; + void dump() const; +#endif // NDEBUG +}; + +} // namespace llvm::sandboxir + +#endif // LLVM_SANDBOXIR_USE_H diff --git a/llvm/include/llvm/Support/DXILABI.h b/llvm/include/llvm/Support/DXILABI.h index 78099ae0daeca6..d0bed4d5cf383c 100644 --- a/llvm/include/llvm/Support/DXILABI.h +++ b/llvm/include/llvm/Support/DXILABI.h @@ -94,6 +94,25 @@ enum class ElementType : uint32_t { PackedU8x32, }; +/// Metadata tags for extra resource properties. +enum class ExtPropTags : uint32_t { + ElementType = 0, + StructuredBufferStride = 1, + SamplerFeedbackKind = 2, + Atomic64Use = 3, +}; + +enum class SamplerType : uint32_t { + Default = 0, + Comparison = 1, + Mono = 2, // Note: Seems to be unused. +}; + +enum class SamplerFeedbackType : uint32_t { + MinMip = 0, + MipRegionUsed = 1, +}; + } // namespace dxil } // namespace llvm diff --git a/llvm/include/llvm/Support/GenericIteratedDominanceFrontier.h b/llvm/include/llvm/Support/GenericIteratedDominanceFrontier.h index 0dc58e37c821df..cb18d5b0c265ad 100644 --- a/llvm/include/llvm/Support/GenericIteratedDominanceFrontier.h +++ b/llvm/include/llvm/Support/GenericIteratedDominanceFrontier.h @@ -25,6 +25,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Support/GenericDomTree.h" #include @@ -37,9 +38,10 @@ namespace IDFCalculatorDetail { /// successors. template struct ChildrenGetterTy { using NodeRef = typename GraphTraits::NodeRef; - using ChildrenTy = SmallVector; + using ChildIteratorType = typename GraphTraits::ChildIteratorType; + using range = iterator_range; - ChildrenTy get(const NodeRef &N); + range get(const NodeRef &N); }; } // end of namespace IDFCalculatorDetail @@ -115,13 +117,12 @@ template class IDFCalculatorBase { namespace IDFCalculatorDetail { template -typename ChildrenGetterTy::ChildrenTy +typename ChildrenGetterTy::range ChildrenGetterTy::get(const NodeRef &N) { using OrderedNodeTy = typename IDFCalculatorBase::OrderedNodeTy; - auto Children = children(N); - return {Children.begin(), Children.end()}; + return children(N); } } // end of namespace IDFCalculatorDetail diff --git a/llvm/include/llvm/TargetParser/RISCVTargetParser.h b/llvm/include/llvm/TargetParser/RISCVTargetParser.h index 5b1494efe7bdc0..7421dac2744b61 100644 --- a/llvm/include/llvm/TargetParser/RISCVTargetParser.h +++ b/llvm/include/llvm/TargetParser/RISCVTargetParser.h @@ -35,7 +35,8 @@ bool parseTuneCPU(StringRef CPU, bool IsRV64); StringRef getMArchFromMcpu(StringRef CPU); void fillValidCPUArchList(SmallVectorImpl &Values, bool IsRV64); void fillValidTuneCPUArchList(SmallVectorImpl &Values, bool IsRV64); -bool hasFastUnalignedAccess(StringRef CPU); +bool hasFastScalarUnalignedAccess(StringRef CPU); +bool hasFastVectorUnalignedAccess(StringRef CPU); } // namespace RISCV diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index cb2be3bbd29f7b..b3bb354b38ff58 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -88,6 +88,8 @@ class Triple { xtensa, // Tensilica: Xtensa nvptx, // NVPTX: 32-bit nvptx64, // NVPTX: 64-bit + le32, // le32: generic little-endian 32-bit CPU (PNaCl) + le64, // le64: generic little-endian 64-bit CPU (PNaCl) amdil, // AMDIL amdil64, // AMDIL with 64-bit pointers hsail, // AMD HSAIL diff --git a/llvm/include/llvm/Transforms/Utils/DXILResource.h b/llvm/include/llvm/Transforms/Utils/DXILResource.h new file mode 100644 index 00000000000000..df01fb457977a0 --- /dev/null +++ b/llvm/include/llvm/Transforms/Utils/DXILResource.h @@ -0,0 +1,191 @@ +//===- DXILResource.h - Tools to translate DXIL resources -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_DXILRESOURCE_H +#define LLVM_TRANSFORMS_UTILS_DXILRESOURCE_H + +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/DXILABI.h" + +namespace llvm { +namespace dxil { + +struct ResourceBinding { + uint32_t Space; + uint32_t LowerBound; + uint32_t Size; + + bool operator==(const ResourceBinding &RHS) const { + return std::tie(Space, LowerBound, Size) == + std::tie(RHS.Space, RHS.LowerBound, RHS.Size); + } + bool operator!=(const ResourceBinding &RHS) const { return !(*this == RHS); } +}; + +class ResourceInfo { + struct UAVInfo { + bool GloballyCoherent; + bool HasCounter; + bool IsROV; + + bool operator==(const UAVInfo &RHS) const { + return std::tie(GloballyCoherent, HasCounter, IsROV) == + std::tie(RHS.GloballyCoherent, RHS.HasCounter, RHS.IsROV); + } + bool operator!=(const UAVInfo &RHS) const { return !(*this == RHS); } + }; + + struct StructInfo { + uint32_t Stride; + Align Alignment; + + bool operator==(const StructInfo &RHS) const { + return std::tie(Stride, Alignment) == std::tie(RHS.Stride, RHS.Alignment); + } + bool operator!=(const StructInfo &RHS) const { return !(*this == RHS); } + }; + + struct TypedInfo { + dxil::ElementType ElementTy; + uint32_t ElementCount; + + bool operator==(const TypedInfo &RHS) const { + return std::tie(ElementTy, ElementCount) == + std::tie(RHS.ElementTy, RHS.ElementCount); + } + bool operator!=(const TypedInfo &RHS) const { return !(*this == RHS); } + }; + + struct MSInfo { + uint32_t Count; + + bool operator==(const MSInfo &RHS) const { return Count == RHS.Count; } + bool operator!=(const MSInfo &RHS) const { return !(*this == RHS); } + }; + + struct FeedbackInfo { + dxil::SamplerFeedbackType Type; + + bool operator==(const FeedbackInfo &RHS) const { return Type == RHS.Type; } + bool operator!=(const FeedbackInfo &RHS) const { return !(*this == RHS); } + }; + + // Universal properties. + Value *Symbol; + StringRef Name; + + ResourceBinding Binding; + uint32_t UniqueID; + + dxil::ResourceClass RC; + dxil::ResourceKind Kind; + + // Resource class dependent properties. + // CBuffer, Sampler, and RawBuffer end here. + union { + UAVInfo UAVFlags; // UAV + uint32_t CBufferSize; // CBuffer + dxil::SamplerType SamplerTy; // Sampler + }; + + // Resource kind dependent properties. + union { + StructInfo Struct; // StructuredBuffer + TypedInfo Typed; // All SRV/UAV except Raw/StructuredBuffer + FeedbackInfo Feedback; // FeedbackTexture + }; + + MSInfo MultiSample; + + // Conditions to check before accessing union members. + bool isUAV() const; + bool isCBuffer() const; + bool isSampler() const; + bool isStruct() const; + bool isTyped() const; + bool isFeedback() const; + bool isMultiSample() const; + + ResourceInfo(dxil::ResourceClass RC, dxil::ResourceKind Kind, Value *Symbol, + StringRef Name, ResourceBinding Binding, uint32_t UniqueID) + : Symbol(Symbol), Name(Name), Binding(Binding), UniqueID(UniqueID), + RC(RC), Kind(Kind) {} + +public: + static ResourceInfo SRV(Value *Symbol, StringRef Name, + ResourceBinding Binding, uint32_t UniqueID, + dxil::ElementType ElementTy, uint32_t ElementCount, + dxil::ResourceKind Kind); + static ResourceInfo RawBuffer(Value *Symbol, StringRef Name, + ResourceBinding Binding, uint32_t UniqueID); + static ResourceInfo StructuredBuffer(Value *Symbol, StringRef Name, + ResourceBinding Binding, + uint32_t UniqueID, uint32_t Stride, + Align Alignment); + static ResourceInfo Texture2DMS(Value *Symbol, StringRef Name, + ResourceBinding Binding, uint32_t UniqueID, + dxil::ElementType ElementTy, + uint32_t ElementCount, uint32_t SampleCount); + static ResourceInfo + Texture2DMSArray(Value *Symbol, StringRef Name, ResourceBinding Binding, + uint32_t UniqueID, dxil::ElementType ElementTy, + uint32_t ElementCount, uint32_t SampleCount); + + static ResourceInfo UAV(Value *Symbol, StringRef Name, + ResourceBinding Binding, uint32_t UniqueID, + dxil::ElementType ElementTy, uint32_t ElementCount, + bool GloballyCoherent, bool IsROV, + dxil::ResourceKind Kind); + static ResourceInfo RWRawBuffer(Value *Symbol, StringRef Name, + ResourceBinding Binding, uint32_t UniqueID, + bool GloballyCoherent, bool IsROV); + static ResourceInfo RWStructuredBuffer(Value *Symbol, StringRef Name, + ResourceBinding Binding, + uint32_t UniqueID, uint32_t Stride, + Align Alignment, bool GloballyCoherent, + bool IsROV, bool HasCounter); + static ResourceInfo RWTexture2DMS(Value *Symbol, StringRef Name, + ResourceBinding Binding, uint32_t UniqueID, + dxil::ElementType ElementTy, + uint32_t ElementCount, uint32_t SampleCount, + bool GloballyCoherent); + static ResourceInfo + RWTexture2DMSArray(Value *Symbol, StringRef Name, ResourceBinding Binding, + uint32_t UniqueID, dxil::ElementType ElementTy, + uint32_t ElementCount, uint32_t SampleCount, + bool GloballyCoherent); + static ResourceInfo FeedbackTexture2D(Value *Symbol, StringRef Name, + ResourceBinding Binding, + uint32_t UniqueID, + dxil::SamplerFeedbackType FeedbackTy); + static ResourceInfo + FeedbackTexture2DArray(Value *Symbol, StringRef Name, ResourceBinding Binding, + uint32_t UniqueID, + dxil::SamplerFeedbackType FeedbackTy); + + static ResourceInfo CBuffer(Value *Symbol, StringRef Name, + ResourceBinding Binding, uint32_t UniqueID, + uint32_t Size); + + static ResourceInfo Sampler(Value *Symbol, StringRef Name, + ResourceBinding Binding, uint32_t UniqueID, + dxil::SamplerType SamplerTy); + + bool operator==(const ResourceInfo &RHS) const; + + MDTuple *getAsMetadata(LLVMContext &Ctx) const; + + ResourceBinding getBinding() const { return Binding; } + std::pair getAnnotateProps() const; +}; + +} // namespace dxil +} // namespace llvm + +#endif // LLVM_TRANSFORMS_UTILS_DXILRESOURCE_H diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 345e09dce0b2b1..1a878126aa0820 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -15,6 +15,7 @@ #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/IR/VectorBuilder.h" #include "llvm/Transforms/Utils/ValueMapper.h" namespace llvm { @@ -394,6 +395,10 @@ Value *getShuffleReduction(IRBuilderBase &Builder, Value *Src, unsigned Op, /// Fast-math-flags are propagated using the IRBuilder's setting. Value *createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind); +/// Overloaded function to generate vector-predication intrinsics for target +/// reduction. +Value *createSimpleTargetReduction(VectorBuilder &VB, Value *Src, + const RecurrenceDescriptor &Desc); /// Create a target reduction of the given vector \p Src for a reduction of the /// kind RecurKind::IAnyOf or RecurKind::FAnyOf. The reduction operation is @@ -414,6 +419,11 @@ Value *createTargetReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, Value *createOrderedReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, Value *Src, Value *Start); +/// Overloaded function to generate vector-predication intrinsics for ordered +/// reduction. +Value *createOrderedReduction(VectorBuilder &VB, + const RecurrenceDescriptor &Desc, Value *Src, + Value *Start); /// Get the intersection (logical and) of all of the potential IR flags /// of each scalar operation (VL) that will be converted into a vector (I). diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h index b4fe6351125b1f..9d564a3279ce77 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h @@ -207,13 +207,6 @@ void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr); -/// Reports an informative message: print \p Msg for debugging purposes as well -/// as an optimization remark. Uses either \p I as location of the remark, or -/// otherwise \p TheLoop. -void reportVectorizationInfo(const StringRef OREMsg, const StringRef ORETag, - OptimizationRemarkEmitter *ORE, Loop *TheLoop, - Instruction *I = nullptr); - } // end namespace llvm #endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZE_H diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 962880f68f0767..df75745645e049 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -2754,27 +2754,28 @@ static Constant *ConstantFoldIntrinsicCall2(Intrinsic::ID IntrinsicID, Type *Ty, ((Mask & fcPosInf) && Op1V.isPosInfinity()); return ConstantInt::get(Ty, Result); } + case Intrinsic::powi: { + int Exp = static_cast(Op2C->getSExtValue()); + switch (Ty->getTypeID()) { + case Type::HalfTyID: + case Type::FloatTyID: { + APFloat Res(static_cast(std::pow(Op1V.convertToFloat(), Exp))); + if (Ty->isHalfTy()) { + bool Unused; + Res.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, + &Unused); + } + return ConstantFP::get(Ty->getContext(), Res); + } + case Type::DoubleTyID: + return ConstantFP::get(Ty, std::pow(Op1V.convertToDouble(), Exp)); + default: + return nullptr; + } + } default: break; } - - if (!Ty->isHalfTy() && !Ty->isFloatTy() && !Ty->isDoubleTy()) - return nullptr; - if (IntrinsicID == Intrinsic::powi && Ty->isHalfTy()) - return ConstantFP::get( - Ty->getContext(), - APFloat((float)std::pow((float)Op1V.convertToDouble(), - (int)Op2C->getZExtValue()))); - if (IntrinsicID == Intrinsic::powi && Ty->isFloatTy()) - return ConstantFP::get( - Ty->getContext(), - APFloat((float)std::pow((float)Op1V.convertToDouble(), - (int)Op2C->getZExtValue()))); - if (IntrinsicID == Intrinsic::powi && Ty->isDoubleTy()) - return ConstantFP::get( - Ty->getContext(), - APFloat((double)std::pow(Op1V.convertToDouble(), - (int)Op2C->getZExtValue()))); } return nullptr; } diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 699ddf271e9e83..ac6df226784345 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -635,9 +635,8 @@ RecurrenceDescriptor::isAnyOfPattern(Loop *Loop, PHINode *OrigPhi, return InstDesc(Select, Prev.getRecKind()); } - // Only match select with single use cmp condition. - if (!match(I, m_Select(m_OneUse(m_Cmp(Pred, m_Value(), m_Value())), m_Value(), - m_Value()))) + if (!match(I, + m_Select(m_Cmp(Pred, m_Value(), m_Value()), m_Value(), m_Value()))) return InstDesc(false, I); SelectInst *SI = cast(I); diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 0917a362eccf5d..3a7ae577bb068a 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -1975,13 +1975,16 @@ static Value *simplifyAndOrWithICmpEq(unsigned Opcode, Value *Op0, Value *Op1, return nullptr; }; - if (Value *Res = - simplifyWithOpReplaced(Op1, A, B, Q, /* AllowRefinement */ true, - /* DropFlags */ nullptr, MaxRecurse)) + // In the final case (Res == Absorber with inverted predicate), it is safe to + // refine poison during simplification, but not undef. For simplicity always + // disable undef-based folds here. + if (Value *Res = simplifyWithOpReplaced(Op1, A, B, Q.getWithoutUndef(), + /* AllowRefinement */ true, + /* DropFlags */ nullptr, MaxRecurse)) return Simplify(Res); - if (Value *Res = - simplifyWithOpReplaced(Op1, B, A, Q, /* AllowRefinement */ true, - /* DropFlags */ nullptr, MaxRecurse)) + if (Value *Res = simplifyWithOpReplaced(Op1, B, A, Q.getWithoutUndef(), + /* AllowRefinement */ true, + /* DropFlags */ nullptr, MaxRecurse)) return Simplify(Res); return nullptr; @@ -4300,6 +4303,9 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, bool AllowRefinement, SmallVectorImpl *DropFlags, unsigned MaxRecurse) { + assert((AllowRefinement || !Q.CanUseUndef) && + "If AllowRefinement=false then CanUseUndef=false"); + // Trivial replacement. if (V == Op) return RepOp; @@ -4347,6 +4353,11 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, } else { NewOps.push_back(InstOp); } + + // Bail out if any operand is undef and SimplifyQuery disables undef + // simplification. Constant folding currently doesn't respect this option. + if (isa(NewOps.back()) && !Q.CanUseUndef) + return nullptr; } if (!AnyReplaced) @@ -4467,6 +4478,11 @@ Value *llvm::simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, const SimplifyQuery &Q, bool AllowRefinement, SmallVectorImpl *DropFlags) { + // If refinement is disabled, also disable undef simplifications (which are + // always refinements) in SimplifyQuery. + if (!AllowRefinement) + return ::simplifyWithOpReplaced(V, Op, RepOp, Q.getWithoutUndef(), + AllowRefinement, DropFlags, RecursionLimit); return ::simplifyWithOpReplaced(V, Op, RepOp, Q, AllowRefinement, DropFlags, RecursionLimit); } @@ -4606,7 +4622,7 @@ static Value *simplifySelectWithICmpEq(Value *CmpLHS, Value *CmpRHS, Value *TrueVal, Value *FalseVal, const SimplifyQuery &Q, unsigned MaxRecurse) { - if (simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q, + if (simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q.getWithoutUndef(), /* AllowRefinement */ false, /* DropFlags */ nullptr, MaxRecurse) == TrueVal) return FalseVal; @@ -5333,6 +5349,14 @@ static Value *simplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, if (Op->getType() == Ty) return Op; + // ptrtoint (ptradd (Ptr, X - ptrtoint(Ptr))) -> X + Value *Ptr, *X; + if (CastOpc == Instruction::PtrToInt && + match(Op, m_PtrAdd(m_Value(Ptr), + m_Sub(m_Value(X), m_PtrToInt(m_Deferred(Ptr))))) && + X->getType() == Ty && Ty == Q.DL.getIndexType(Ptr->getType())) + return X; + return nullptr; } diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index 496308a0c247a4..a44d5a3bbe462a 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -290,7 +290,8 @@ void Lint::visitCallBase(CallBase &I) { // TODO: Check more intrinsics - case Intrinsic::memcpy: { + case Intrinsic::memcpy: + case Intrinsic::memcpy_inline: { MemCpyInst *MCI = cast(&I); visitMemoryReference(I, MemoryLocation::getForDest(MCI), MCI->getDestAlign(), nullptr, MemRef::Write); @@ -311,23 +312,6 @@ void Lint::visitCallBase(CallBase &I) { "Undefined behavior: memcpy source and destination overlap", &I); break; } - case Intrinsic::memcpy_inline: { - MemCpyInlineInst *MCII = cast(&I); - const uint64_t Size = MCII->getLength()->getValue().getLimitedValue(); - visitMemoryReference(I, MemoryLocation::getForDest(MCII), - MCII->getDestAlign(), nullptr, MemRef::Write); - visitMemoryReference(I, MemoryLocation::getForSource(MCII), - MCII->getSourceAlign(), nullptr, MemRef::Read); - - // Check that the memcpy arguments don't overlap. The AliasAnalysis API - // isn't expressive enough for what we really want to do. Known partial - // overlap is not distinguished from the case where nothing is known. - const LocationSize LS = LocationSize::precise(Size); - Check(AA->alias(MCII->getSource(), LS, MCII->getDest(), LS) != - AliasResult::MustAlias, - "Undefined behavior: memcpy source and destination overlap", &I); - break; - } case Intrinsic::memmove: { MemMoveInst *MMI = cast(&I); visitMemoryReference(I, MemoryLocation::getForDest(MMI), diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 018861a665c4cd..91994f33f30463 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -206,12 +206,13 @@ RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup( static std::pair getStartAndEndForAccess( const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, PredicatedScalarEvolution &PSE, - DenseMap> - &PointerBounds) { + DenseMap, + std::pair> &PointerBounds) { ScalarEvolution *SE = PSE.getSE(); auto [Iter, Ins] = PointerBounds.insert( - {PtrExpr, {SE->getCouldNotCompute(), SE->getCouldNotCompute()}}); + {{PtrExpr, AccessTy}, + {SE->getCouldNotCompute(), SE->getCouldNotCompute()}}); if (!Ins) return Iter->second; diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 7be8a18dd72712..f8ec8683983239 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -959,6 +959,32 @@ getKnownBitsFromAndXorOr(const Operator *I, const APInt &DemandedElts, return KnownOut; } +static KnownBits computeKnownBitsForHorizontalOperation( + const Operator *I, const APInt &DemandedElts, unsigned Depth, + const SimplifyQuery &Q, + const function_ref + KnownBitsFunc) { + APInt DemandedEltsLHS, DemandedEltsRHS; + getHorizDemandedEltsForFirstOperand(Q.DL.getTypeSizeInBits(I->getType()), + DemandedElts, DemandedEltsLHS, + DemandedEltsRHS); + + const auto ComputeForSingleOpFunc = + [Depth, &Q, KnownBitsFunc](const Value *Op, APInt &DemandedEltsOp) { + return KnownBitsFunc( + computeKnownBits(Op, DemandedEltsOp, Depth + 1, Q), + computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1, Q)); + }; + + if (DemandedEltsRHS.isZero()) + return ComputeForSingleOpFunc(I->getOperand(0), DemandedEltsLHS); + if (DemandedEltsLHS.isZero()) + return ComputeForSingleOpFunc(I->getOperand(1), DemandedEltsRHS); + + return ComputeForSingleOpFunc(I->getOperand(0), DemandedEltsLHS) + .intersectWith(ComputeForSingleOpFunc(I->getOperand(1), DemandedEltsRHS)); +} + // Public so this can be used in `SimplifyDemandedUseBits`. KnownBits llvm::analyzeKnownBitsFromAndXorOr(const Operator *I, const KnownBits &KnownLHS, @@ -1684,6 +1710,8 @@ static void computeKnownBitsFromOperator(const Operator *I, computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); Known = KnownBits::ssub_sat(Known, Known2); break; + // Vec reverse preserves bits from input vec. + case Intrinsic::vector_reverse: // for min/max/and/or reduce, any bit common to each element in the // input vec is set in the output. case Intrinsic::vector_reduce_and: @@ -1756,6 +1784,44 @@ static void computeKnownBitsFromOperator(const Operator *I, case Intrinsic::x86_sse42_crc32_64_64: Known.Zero.setBitsFrom(32); break; + case Intrinsic::x86_ssse3_phadd_d_128: + case Intrinsic::x86_ssse3_phadd_w_128: + case Intrinsic::x86_avx2_phadd_d: + case Intrinsic::x86_avx2_phadd_w: { + Known = computeKnownBitsForHorizontalOperation( + I, DemandedElts, Depth, Q, + [](const KnownBits &KnownLHS, const KnownBits &KnownRHS) { + return KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/false, + /*NUW=*/false, KnownLHS, + KnownRHS); + }); + break; + } + case Intrinsic::x86_ssse3_phadd_sw_128: + case Intrinsic::x86_avx2_phadd_sw: { + Known = computeKnownBitsForHorizontalOperation(I, DemandedElts, Depth, + Q, KnownBits::sadd_sat); + break; + } + case Intrinsic::x86_ssse3_phsub_d_128: + case Intrinsic::x86_ssse3_phsub_w_128: + case Intrinsic::x86_avx2_phsub_d: + case Intrinsic::x86_avx2_phsub_w: { + Known = computeKnownBitsForHorizontalOperation( + I, DemandedElts, Depth, Q, + [](const KnownBits &KnownLHS, const KnownBits &KnownRHS) { + return KnownBits::computeForAddSub(/*Add=*/false, /*NSW=*/false, + /*NUW=*/false, KnownLHS, + KnownRHS); + }); + break; + } + case Intrinsic::x86_ssse3_phsub_sw_128: + case Intrinsic::x86_avx2_phsub_sw: { + Known = computeKnownBitsForHorizontalOperation(I, DemandedElts, Depth, + Q, KnownBits::ssub_sat); + break; + } case Intrinsic::riscv_vsetvli: case Intrinsic::riscv_vsetvlimax: { bool HasAVL = II->getIntrinsicID() == Intrinsic::riscv_vsetvli; @@ -3026,6 +3092,8 @@ static bool isKnownNonZeroFromOperator(const Operator *I, return isNonZeroAdd(DemandedElts, Depth, Q, BitWidth, II->getArgOperand(0), II->getArgOperand(1), /*NSW=*/true, /* NUW=*/false); + // Vec reverse preserves zero/non-zero status from input vec. + case Intrinsic::vector_reverse: // umin/smin/smax/smin/or of all non-zero elements is always non-zero. case Intrinsic::vector_reduce_or: case Intrinsic::vector_reduce_umax: @@ -5163,6 +5231,11 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts, Known.SignBit.reset(); break; } + // reverse preserves all characteristics of the input vec's element. + case Intrinsic::vector_reverse: + Known = computeKnownFPClass(II->getArgOperand(0), II->getFastMathFlags(), + InterestedClasses, Depth + 1, Q); + break; case Intrinsic::trunc: case Intrinsic::floor: case Intrinsic::ceil: @@ -8644,10 +8717,7 @@ llvm::canConvertToMinOrMaxIntrinsic(ArrayRef VL) { if (all_of(VL, [&SelectPattern, &AllCmpSingleUse](Value *I) { Value *LHS, *RHS; auto CurrentPattern = matchSelectPattern(I, LHS, RHS); - if (!SelectPatternResult::isMinOrMax(CurrentPattern.Flavor) || - CurrentPattern.Flavor == SPF_FMINNUM || - CurrentPattern.Flavor == SPF_FMAXNUM || - !I->getType()->isIntOrIntVectorTy()) + if (!SelectPatternResult::isMinOrMax(CurrentPattern.Flavor)) return false; if (SelectPattern.Flavor != SPF_UNKNOWN && SelectPattern.Flavor != CurrentPattern.Flavor) @@ -8666,6 +8736,10 @@ llvm::canConvertToMinOrMaxIntrinsic(ArrayRef VL) { return {Intrinsic::smax, AllCmpSingleUse}; case SPF_UMAX: return {Intrinsic::umax, AllCmpSingleUse}; + case SPF_FMAXNUM: + return {Intrinsic::maxnum, AllCmpSingleUse}; + case SPF_FMINNUM: + return {Intrinsic::minnum, AllCmpSingleUse}; default: llvm_unreachable("unexpected select pattern flavor"); } diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index fd1c3378e2495e..cc742ab35f4498 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -567,6 +567,34 @@ void llvm::processShuffleMasks( } } +void llvm::getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, + const APInt &DemandedElts, + APInt &DemandedLHS, + APInt &DemandedRHS) { + assert(VectorBitWidth >= 128 && "Vectors smaller than 128 bit not supported"); + int NumLanes = VectorBitWidth / 128; + int NumElts = DemandedElts.getBitWidth(); + int NumEltsPerLane = NumElts / NumLanes; + int HalfEltsPerLane = NumEltsPerLane / 2; + + DemandedLHS = APInt::getZero(NumElts); + DemandedRHS = APInt::getZero(NumElts); + + // Map DemandedElts to the horizontal operands. + for (int Idx = 0; Idx != NumElts; ++Idx) { + if (!DemandedElts[Idx]) + continue; + int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane; + int LocalIdx = Idx % NumEltsPerLane; + if (LocalIdx < HalfEltsPerLane) { + DemandedLHS.setBit(LaneIdx + 2 * LocalIdx); + } else { + LocalIdx -= HalfEltsPerLane; + DemandedRHS.setBit(LaneIdx + 2 * LocalIdx); + } + } +} + MapVector llvm::computeMinimumValueSizes(ArrayRef Blocks, DemandedBits &DB, const TargetTransformInfo *TTI) { diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index cea09bcb453863..ebcf76175a36ba 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -21,7 +21,7 @@ #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/CodeGen/AtomicExpand.h" #include "llvm/CodeGen/AtomicExpandUtils.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index ccd8f76fb4f634..31fa4c105cef80 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -97,7 +97,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeMachineSinkingPass(Registry); initializeMachineUniformityAnalysisPassPass(Registry); initializeMachineUniformityInfoPrinterPassPass(Registry); - initializeMachineVerifierPassPass(Registry); + initializeMachineVerifierLegacyPassPass(Registry); initializeObjCARCContractLegacyPassPass(Registry); initializeOptimizePHIsPass(Registry); initializePEIPass(Registry); @@ -132,7 +132,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeStripDebugMachineModulePass(Registry); initializeTailDuplicatePass(Registry); initializeTargetPassConfigPass(Registry); - initializeTwoAddressInstructionPassPass(Registry); + initializeTwoAddressInstructionLegacyPassPass(Registry); initializeTypePromotionLegacyPass(Registry); initializeUnpackMachineBundlesPass(Registry); initializeUnreachableBlockElimLegacyPassPass(Registry); diff --git a/llvm/lib/CodeGen/DwarfEHPrepare.cpp b/llvm/lib/CodeGen/DwarfEHPrepare.cpp index 09e7cfb12bdbad..324329ce989e71 100644 --- a/llvm/lib/CodeGen/DwarfEHPrepare.cpp +++ b/llvm/lib/CodeGen/DwarfEHPrepare.cpp @@ -18,7 +18,7 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index ee94c0bfbf9d0f..d16585b5650a7d 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -149,6 +149,14 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB, // Try looking through a bitcast from one function type to another. // Commonly happens with calls to objc_msgSend(). const Value *CalleeV = CB.getCalledOperand()->stripPointerCasts(); + + // If IRTranslator chose to drop the ptrauth info, we can turn this into + // a direct call. + if (!PAI && CB.countOperandBundlesOfType(LLVMContext::OB_ptrauth)) { + CalleeV = cast(CalleeV)->getPointer(); + assert(isa(CalleeV)); + } + if (const Function *F = dyn_cast(CalleeV)) { if (F->hasFnAttribute(Attribute::NonLazyBind)) { LLT Ty = getLLTForType(*F->getType(), DL); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index d348c2b86916f2..97be19825fcf35 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -38,7 +38,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/StackProtector.h" #include "llvm/CodeGen/SwitchLoweringUtils.h" #include "llvm/CodeGen/TargetFrameLowering.h" @@ -2649,17 +2649,24 @@ bool IRTranslator::translateCallBase(const CallBase &CB, } std::optional PAI; - if (CB.countOperandBundlesOfType(LLVMContext::OB_ptrauth)) { + if (auto Bundle = CB.getOperandBundle(LLVMContext::OB_ptrauth)) { // Functions should never be ptrauth-called directly. assert(!CB.getCalledFunction() && "invalid direct ptrauth call"); - auto PAB = CB.getOperandBundle("ptrauth"); - const Value *Key = PAB->Inputs[0]; - const Value *Discriminator = PAB->Inputs[1]; - - Register DiscReg = getOrCreateVReg(*Discriminator); - PAI = CallLowering::PtrAuthInfo{cast(Key)->getZExtValue(), - DiscReg}; + const Value *Key = Bundle->Inputs[0]; + const Value *Discriminator = Bundle->Inputs[1]; + + // Look through ptrauth constants to try to eliminate the matching bundle + // and turn this into a direct call with no ptrauth. + // CallLowering will use the raw pointer if it doesn't find the PAI. + const auto *CalleeCPA = dyn_cast(CB.getCalledOperand()); + if (!CalleeCPA || !isa(CalleeCPA->getPointer()) || + !CalleeCPA->isKnownCompatibleWith(Key, Discriminator, *DL)) { + // If we can't make it direct, package the bundle into PAI. + Register DiscReg = getOrCreateVReg(*Discriminator); + PAI = CallLowering::PtrAuthInfo{cast(Key)->getZExtValue(), + DiscReg}; + } } Register ConvergenceCtrlToken = 0; diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index f717849317ba72..b58c96a8668836 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -25,7 +25,7 @@ #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" @@ -1154,16 +1154,13 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { } case TargetOpcode::G_SITOFP: case TargetOpcode::G_UITOFP: { - // FIXME: Support other types unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); - unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64)) + Type *ToTy = + getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg())); + if ((FromSize != 32 && FromSize != 64 && FromSize != 128) || !ToTy) return UnableToLegalize; LegalizeResult Status = conversionLibcall( - MI, MIRBuilder, - ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx), - FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx), - LocObserver); + MI, MIRBuilder, ToTy, Type::getIntNTy(Ctx, FromSize), LocObserver); if (Status != Legalized) return Status; break; @@ -3421,6 +3418,54 @@ LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx, return UnableToLegalize; } +// This attempts to handle G_CONCAT_VECTORS with illegal operands, particularly +// those that have smaller than legal operands. +// +// <16 x s8> = G_CONCAT_VECTORS <4 x s8>, <4 x s8>, <4 x s8>, <4 x s8> +// +// ===> +// +// s32 = G_BITCAST <4 x s8> +// s32 = G_BITCAST <4 x s8> +// s32 = G_BITCAST <4 x s8> +// s32 = G_BITCAST <4 x s8> +// <4 x s32> = G_BUILD_VECTOR s32, s32, s32, s32 +// <16 x s8> = G_BITCAST <4 x s32> +LegalizerHelper::LegalizeResult +LegalizerHelper::bitcastConcatVector(MachineInstr &MI, unsigned TypeIdx, + LLT CastTy) { + // Convert it to CONCAT instruction + auto ConcatMI = dyn_cast(&MI); + if (!ConcatMI) { + return UnableToLegalize; + } + + // Check if bitcast is Legal + auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); + LLT SrcScalTy = LLT::scalar(SrcTy.getSizeInBits()); + + // Check if the build vector is Legal + if (!LI.isLegal({TargetOpcode::G_BUILD_VECTOR, {CastTy, SrcScalTy}})) { + return UnableToLegalize; + } + + // Bitcast the sources + SmallVector BitcastRegs; + for (unsigned i = 0; i < ConcatMI->getNumSources(); i++) { + BitcastRegs.push_back( + MIRBuilder.buildBitcast(SrcScalTy, ConcatMI->getSourceReg(i)) + .getReg(0)); + } + + // Build the scalar values into a vector + Register BuildReg = + MIRBuilder.buildBuildVector(CastTy, BitcastRegs).getReg(0); + MIRBuilder.buildBitcast(DstReg, BuildReg); + + MI.eraseFromParent(); + return Legalized; +} + LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) { // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT Register DstReg = LoadMI.getDstReg(); @@ -3725,6 +3770,8 @@ LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) { return bitcastExtractVectorElt(MI, TypeIdx, CastTy); case TargetOpcode::G_INSERT_VECTOR_ELT: return bitcastInsertVectorElt(MI, TypeIdx, CastTy); + case TargetOpcode::G_CONCAT_VECTORS: + return bitcastConcatVector(MI, TypeIdx, CastTy); default: return UnableToLegalize; } diff --git a/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp b/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp index 1c31eba909e780..039f07f2e5e3f9 100644 --- a/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp +++ b/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp @@ -31,6 +31,14 @@ DiagnosticInfoMIROptimization::MachineArgument::MachineArgument( /*SkipDebugLoc=*/true); } +bool MachineOptimizationRemarkEmitter::invalidate( + MachineFunction &MF, const PreservedAnalyses &PA, + MachineFunctionAnalysisManager::Invalidator &Inv) { + // This analysis has no state and so can be trivially preserved but it needs + // a fresh view of BFI if it was constructed with one. + return MBFI && Inv.invalidate(MF, PA); +} + std::optional MachineOptimizationRemarkEmitter::computeHotness(const MachineBasicBlock &MBB) { if (!MBFI) @@ -86,6 +94,18 @@ void MachineOptimizationRemarkEmitterPass::getAnalysisUsage( MachineFunctionPass::getAnalysisUsage(AU); } +AnalysisKey MachineOptimizationRemarkEmitterAnalysis::Key; + +MachineOptimizationRemarkEmitterAnalysis::Result +MachineOptimizationRemarkEmitterAnalysis::run( + MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { + MachineBlockFrequencyInfo *MBFI = + MF.getFunction().getContext().getDiagnosticsHotnessRequested() + ? &MFAM.getResult(MF) + : nullptr; + return Result(MF, MBFI); +} + char MachineOptimizationRemarkEmitterPass::ID = 0; static const char ore_name[] = "Machine Optimization Remark Emitter"; #define ORE_NAME "machine-opt-remark-emitter" diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index d0d3af0e5e4fcc..0a5b8bdbc93713 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -20,6 +20,7 @@ // -verify-machineinstrs. //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/MachineVerifier.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -93,6 +94,9 @@ using namespace llvm; namespace { struct MachineVerifier { + MachineVerifier(MachineFunctionAnalysisManager &MFAM, const char *b) + : MFAM(&MFAM), Banner(b) {} + MachineVerifier(Pass *pass, const char *b) : PASS(pass), Banner(b) {} MachineVerifier(const char *b, LiveVariables *LiveVars, @@ -103,6 +107,7 @@ namespace { unsigned verify(const MachineFunction &MF); + MachineFunctionAnalysisManager *MFAM = nullptr; Pass *const PASS = nullptr; const char *Banner; const MachineFunction *MF = nullptr; @@ -302,15 +307,15 @@ namespace { void verifyProperties(const MachineFunction &MF); }; - struct MachineVerifierPass : public MachineFunctionPass { + struct MachineVerifierLegacyPass : public MachineFunctionPass { static char ID; // Pass ID, replacement for typeid const std::string Banner; - MachineVerifierPass(std::string banner = std::string()) - : MachineFunctionPass(ID), Banner(std::move(banner)) { - initializeMachineVerifierPassPass(*PassRegistry::getPassRegistry()); - } + MachineVerifierLegacyPass(std::string banner = std::string()) + : MachineFunctionPass(ID), Banner(std::move(banner)) { + initializeMachineVerifierLegacyPassPass(*PassRegistry::getPassRegistry()); + } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addUsedIfAvailable(); @@ -338,13 +343,28 @@ namespace { } // end anonymous namespace -char MachineVerifierPass::ID = 0; +PreservedAnalyses +MachineVerifierPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + // Skip functions that have known verification problems. + // FIXME: Remove this mechanism when all problematic passes have been + // fixed. + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailsVerification)) + return PreservedAnalyses::all(); + unsigned FoundErrors = MachineVerifier(MFAM, Banner.c_str()).verify(MF); + if (FoundErrors) + report_fatal_error("Found " + Twine(FoundErrors) + " machine code errors."); + return PreservedAnalyses::all(); +} + +char MachineVerifierLegacyPass::ID = 0; -INITIALIZE_PASS(MachineVerifierPass, "machineverifier", +INITIALIZE_PASS(MachineVerifierLegacyPass, "machineverifier", "Verify generated machine code", false, false) FunctionPass *llvm::createMachineVerifierPass(const std::string &Banner) { - return new MachineVerifierPass(Banner); + return new MachineVerifierLegacyPass(Banner); } void llvm::verifyMachineFunction(const std::string &Banner, @@ -438,6 +458,14 @@ unsigned MachineVerifier::verify(const MachineFunction &MF) { auto *SIWrapper = PASS->getAnalysisIfAvailable(); Indexes = SIWrapper ? &SIWrapper->getSI() : nullptr; } + if (MFAM) { + MachineFunction &Func = const_cast(MF); + LiveInts = MFAM->getCachedResult(Func); + if (!LiveInts) + LiveVars = MFAM->getCachedResult(Func); + // TODO: LiveStks = MFAM->getCachedResult(Func); + Indexes = MFAM->getCachedResult(Func); + } verifySlotIndexes(); diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index 8572cdc1604562..19950f3eb67bad 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -230,6 +230,21 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const { break; } + case Intrinsic::memcpy_inline: { + // Only expand llvm.memcpy.inline with non-constant length in this + // codepath, leaving the current SelectionDAG expansion for constant + // length memcpy intrinsics undisturbed. + auto *Memcpy = cast(Inst); + if (isa(Memcpy->getLength())) + break; + + Function *ParentFunc = Memcpy->getFunction(); + const TargetTransformInfo &TTI = LookupTTI(*ParentFunc); + expandMemCpyAsLoop(Memcpy, TTI); + Changed = true; + Memcpy->eraseFromParent(); + break; + } case Intrinsic::memmove: { auto *Memmove = cast(Inst); Function *ParentFunc = Memmove->getFunction(); @@ -291,6 +306,7 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const { default: break; case Intrinsic::memcpy: + case Intrinsic::memcpy_inline: case Intrinsic::memmove: case Intrinsic::memset: case Intrinsic::memset_inline: diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index cece76f6583077..302ad128f4f532 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -37,7 +37,7 @@ #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" @@ -2300,24 +2300,12 @@ static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op, return true; } - if (N.getOpcode() != ISD::SETCC || - N.getValueType().getScalarType() != MVT::i1 || - cast(N.getOperand(2))->get() != ISD::SETNE) - return false; - - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - assert(Op0.getValueType() == Op1.getValueType()); - - if (isNullOrNullSplat(Op0)) - Op = Op1; - else if (isNullOrNullSplat(Op1)) - Op = Op0; - else + if (N.getValueType().getScalarType() != MVT::i1 || + !sd_match( + N, m_c_SetCC(m_Value(Op), m_Zero(), m_SpecificCondCode(ISD::SETNE)))) return false; Known = DAG.computeKnownBits(Op); - return (Known.Zero | 1).isAllOnes(); } @@ -2544,16 +2532,12 @@ static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, const SDLoc &DL, return SDValue(); // Match the zext operand as a setcc of a boolean. - if (Z.getOperand(0).getOpcode() != ISD::SETCC || - Z.getOperand(0).getValueType() != MVT::i1) + if (Z.getOperand(0).getValueType() != MVT::i1) return SDValue(); // Match the compare as: setcc (X & 1), 0, eq. - SDValue SetCC = Z.getOperand(0); - ISD::CondCode CC = cast(SetCC->getOperand(2))->get(); - if (CC != ISD::SETEQ || !isNullConstant(SetCC.getOperand(1)) || - SetCC.getOperand(0).getOpcode() != ISD::AND || - !isOneConstant(SetCC.getOperand(0).getOperand(1))) + if (!sd_match(Z.getOperand(0), m_SetCC(m_And(m_Value(), m_One()), m_Zero(), + m_SpecificCondCode(ISD::SETEQ)))) return SDValue(); // We are adding/subtracting a constant and an inverted low bit. Turn that @@ -2561,9 +2545,9 @@ static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, const SDLoc &DL, // add (zext i1 (seteq (X & 1), 0)), C --> sub C+1, (zext (X & 1)) // sub C, (zext i1 (seteq (X & 1), 0)) --> add C-1, (zext (X & 1)) EVT VT = C.getValueType(); - SDValue LowBit = DAG.getZExtOrTrunc(SetCC.getOperand(0), DL, VT); - SDValue C1 = IsAdd ? DAG.getConstant(CN->getAPIntValue() + 1, DL, VT) : - DAG.getConstant(CN->getAPIntValue() - 1, DL, VT); + SDValue LowBit = DAG.getZExtOrTrunc(Z.getOperand(0).getOperand(0), DL, VT); + SDValue C1 = IsAdd ? DAG.getConstant(CN->getAPIntValue() + 1, DL, VT) + : DAG.getConstant(CN->getAPIntValue() - 1, DL, VT); return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, C1, LowBit); } @@ -2661,8 +2645,7 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) { return DAG.getNode(ISD::ADD, DL, VT, N1, N0); if (areBitwiseNotOfEachother(N0, N1)) - return DAG.getConstant(APInt::getAllOnes(VT.getScalarSizeInBits()), - SDLoc(N), VT); + return DAG.getConstant(APInt::getAllOnes(VT.getScalarSizeInBits()), DL, VT); // fold vector ops if (VT.isVector()) { @@ -2759,7 +2742,7 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) { return SD; } - SDValue A, B, C; + SDValue A, B, C, D; // fold ((0-A) + B) -> B-A if (sd_match(N0, m_Neg(m_Value(A)))) @@ -2799,18 +2782,12 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) { return DAG.getNode(N1.getOpcode(), DL, VT, B, C); // fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant - if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB && - N0->hasOneUse() && N1->hasOneUse()) { - SDValue N00 = N0.getOperand(0); - SDValue N01 = N0.getOperand(1); - SDValue N10 = N1.getOperand(0); - SDValue N11 = N1.getOperand(1); - - if (isConstantOrConstantVector(N00) || isConstantOrConstantVector(N10)) - return DAG.getNode(ISD::SUB, DL, VT, - DAG.getNode(ISD::ADD, SDLoc(N0), VT, N00, N10), - DAG.getNode(ISD::ADD, SDLoc(N1), VT, N01, N11)); - } + if (sd_match(N0, m_OneUse(m_Sub(m_Value(A), m_Value(B)))) && + sd_match(N1, m_OneUse(m_Sub(m_Value(C), m_Value(D)))) && + (isConstantOrConstantVector(A) || isConstantOrConstantVector(C))) + return DAG.getNode(ISD::SUB, DL, VT, + DAG.getNode(ISD::ADD, SDLoc(N0), VT, A, C), + DAG.getNode(ISD::ADD, SDLoc(N1), VT, B, D)); // fold (add (umax X, C), -C) --> (usubsat X, C) if (N0.getOpcode() == ISD::UMAX && hasOperation(ISD::USUBSAT, VT)) { @@ -11554,13 +11531,12 @@ static SDValue foldVSelectToSignBitSplatMask(SDNode *N, SelectionDAG &DAG) { SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); EVT VT = N->getValueType(0); - if (N0.getOpcode() != ISD::SETCC || !N0.hasOneUse()) - return SDValue(); - SDValue Cond0 = N0.getOperand(0); - SDValue Cond1 = N0.getOperand(1); - ISD::CondCode CC = cast(N0.getOperand(2))->get(); - if (VT != Cond0.getValueType()) + SDValue Cond0, Cond1; + ISD::CondCode CC; + if (!sd_match(N0, m_OneUse(m_SetCC(m_Value(Cond0), m_Value(Cond1), + m_CondCode(CC)))) || + VT != Cond0.getValueType()) return SDValue(); // Match a signbit check of Cond0 as "Cond0 s<0". Swap select operands if the @@ -17509,10 +17485,10 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); + SDLoc DL(N); // fold (fcopysign c1, c2) -> fcopysign(c1,c2) - if (SDValue C = - DAG.FoldConstantArithmetic(ISD::FCOPYSIGN, SDLoc(N), VT, {N0, N1})) + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FCOPYSIGN, DL, VT, {N0, N1})) return C; if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N->getOperand(1))) { @@ -17521,10 +17497,10 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1) if (!V.isNegative()) { if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT)) - return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0); + return DAG.getNode(ISD::FABS, DL, VT, N0); } else { if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) - return DAG.getNode(ISD::FNEG, SDLoc(N), VT, + return DAG.getNode(ISD::FNEG, DL, VT, DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0)); } } @@ -17534,20 +17510,20 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { // copysign(copysign(x,z), y) -> copysign(x, y) if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN) - return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0.getOperand(0), N1); + return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0.getOperand(0), N1); // copysign(x, abs(y)) -> abs(x) if (N1.getOpcode() == ISD::FABS) - return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0); + return DAG.getNode(ISD::FABS, DL, VT, N0); // copysign(x, copysign(y,z)) -> copysign(x, z) if (N1.getOpcode() == ISD::FCOPYSIGN) - return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(1)); + return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0, N1.getOperand(1)); // copysign(x, fp_extend(y)) -> copysign(x, y) // copysign(x, fp_round(y)) -> copysign(x, y) if (CanCombineFCOPYSIGN_EXTEND_ROUND(N)) - return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0)); + return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0, N1.getOperand(0)); // We only take the sign bit from the sign operand. EVT SignVT = N1.getValueType(); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 9f515739ee0481..7f5b46af01c62f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -25,7 +25,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetFrameLowering.h" diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 897bdc71818f8a..9bd0d1c51fbc27 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -36,7 +36,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" #include "llvm/CodeGen/SelectionDAGNodes.h" diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index b0746014daf5ac..51cbdd9b3ad310 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -44,7 +44,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/StackMaps.h" @@ -9454,6 +9454,14 @@ void SelectionDAGBuilder::LowerCallSiteWithPtrAuthBundle( assert(Discriminator->getType()->isIntegerTy(64) && "Invalid ptrauth discriminator"); + // Look through ptrauth constants to find the raw callee. + // Do a direct unauthenticated call if we found it and everything matches. + if (const auto *CalleeCPA = dyn_cast(CalleeV)) + if (CalleeCPA->isKnownCompatibleWith(Key, Discriminator, + DAG.getDataLayout())) + return LowerCallTo(CB, getValue(CalleeCPA->getPointer()), CB.isTailCall(), + CB.isMustTailCall(), EHPadBB); + // Functions should never be ptrauth-called directly. assert(!isa(CalleeV) && "invalid direct ptrauth call"); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index ecdbf3e963b835..df3d207d85d351 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -2220,24 +2220,27 @@ bool SelectionDAGISel::CheckOrMask(SDValue LHS, ConstantSDNode *RHS, /// by tblgen. Others should not call it. void SelectionDAGISel::SelectInlineAsmMemoryOperands(std::vector &Ops, const SDLoc &DL) { - std::vector InOps; - std::swap(InOps, Ops); + // Change the vector of SDValue into a list of SDNodeHandle for x86 might call + // replaceAllUses when matching address. - Ops.push_back(InOps[InlineAsm::Op_InputChain]); // 0 - Ops.push_back(InOps[InlineAsm::Op_AsmString]); // 1 - Ops.push_back(InOps[InlineAsm::Op_MDNode]); // 2, !srcloc - Ops.push_back(InOps[InlineAsm::Op_ExtraInfo]); // 3 (SideEffect, AlignStack) + std::list Handles; - unsigned i = InlineAsm::Op_FirstOperand, e = InOps.size(); - if (InOps[e-1].getValueType() == MVT::Glue) + Handles.emplace_back(Ops[InlineAsm::Op_InputChain]); // 0 + Handles.emplace_back(Ops[InlineAsm::Op_AsmString]); // 1 + Handles.emplace_back(Ops[InlineAsm::Op_MDNode]); // 2, !srcloc + Handles.emplace_back( + Ops[InlineAsm::Op_ExtraInfo]); // 3 (SideEffect, AlignStack) + + unsigned i = InlineAsm::Op_FirstOperand, e = Ops.size(); + if (Ops[e - 1].getValueType() == MVT::Glue) --e; // Don't process a glue operand if it is here. while (i != e) { - InlineAsm::Flag Flags(InOps[i]->getAsZExtVal()); + InlineAsm::Flag Flags(Ops[i]->getAsZExtVal()); if (!Flags.isMemKind() && !Flags.isFuncKind()) { // Just skip over this operand, copying the operands verbatim. - Ops.insert(Ops.end(), InOps.begin() + i, - InOps.begin() + i + Flags.getNumOperandRegisters() + 1); + Handles.insert(Handles.end(), Ops.begin() + i, + Ops.begin() + i + Flags.getNumOperandRegisters() + 1); i += Flags.getNumOperandRegisters() + 1; } else { assert(Flags.getNumOperandRegisters() == 1 && @@ -2247,10 +2250,10 @@ void SelectionDAGISel::SelectInlineAsmMemoryOperands(std::vector &Ops, if (Flags.isUseOperandTiedToDef(TiedToOperand)) { // We need the constraint ID from the operand this is tied to. unsigned CurOp = InlineAsm::Op_FirstOperand; - Flags = InlineAsm::Flag(InOps[CurOp]->getAsZExtVal()); + Flags = InlineAsm::Flag(Ops[CurOp]->getAsZExtVal()); for (; TiedToOperand; --TiedToOperand) { CurOp += Flags.getNumOperandRegisters() + 1; - Flags = InlineAsm::Flag(InOps[CurOp]->getAsZExtVal()); + Flags = InlineAsm::Flag(Ops[CurOp]->getAsZExtVal()); } } @@ -2258,7 +2261,7 @@ void SelectionDAGISel::SelectInlineAsmMemoryOperands(std::vector &Ops, std::vector SelOps; const InlineAsm::ConstraintCode ConstraintID = Flags.getMemoryConstraintID(); - if (SelectInlineAsmMemoryOperand(InOps[i+1], ConstraintID, SelOps)) + if (SelectInlineAsmMemoryOperand(Ops[i + 1], ConstraintID, SelOps)) report_fatal_error("Could not match memory address. Inline asm" " failure!"); @@ -2267,15 +2270,19 @@ void SelectionDAGISel::SelectInlineAsmMemoryOperands(std::vector &Ops, : InlineAsm::Kind::Func, SelOps.size()); Flags.setMemConstraint(ConstraintID); - Ops.push_back(CurDAG->getTargetConstant(Flags, DL, MVT::i32)); - llvm::append_range(Ops, SelOps); + Handles.emplace_back(CurDAG->getTargetConstant(Flags, DL, MVT::i32)); + Handles.insert(Handles.end(), SelOps.begin(), SelOps.end()); i += 2; } } // Add the glue input back if present. - if (e != InOps.size()) - Ops.push_back(InOps.back()); + if (e != Ops.size()) + Handles.emplace_back(Ops.back()); + + Ops.clear(); + for (auto &handle : Handles) + Ops.push_back(handle.getValue()); } /// findGlueUse - Return use of MVT::Glue value produced by the specified diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index 671ec84fb94163..4268da8670d500 100644 --- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -26,7 +26,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/StackMaps.h" diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 690a86bd4606c1..1433c8821248d1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -2586,6 +2586,17 @@ bool TargetLowering::SimplifyDemandedBits( break; if (Src.getNode()->hasOneUse()) { + if (isTruncateFree(Src, VT) && + !isTruncateFree(Src.getValueType(), VT)) { + // If truncate is only free at trunc(srl), do not turn it into + // srl(trunc). The check is done by first check the truncate is free + // at Src's opcode(srl), then check the truncate is not done by + // referencing sub-register. In test, if both trunc(srl) and + // srl(trunc)'s trunc are free, srl(trunc) performs better. If only + // trunc(srl)'s trunc is free, trunc(srl) is better. + break; + } + std::optional ShAmtC = TLO.DAG.getValidShiftAmount(Src, DemandedElts, Depth + 2); if (!ShAmtC || *ShAmtC >= BitWidth) @@ -2596,7 +2607,6 @@ bool TargetLowering::SimplifyDemandedBits( APInt::getHighBitsSet(OperandBitWidth, OperandBitWidth - BitWidth); HighBits.lshrInPlace(ShVal); HighBits = HighBits.trunc(BitWidth); - if (!(HighBits & DemandedBits)) { // None of the shifted in bits are needed. Add a truncate of the // shift input, then shift it. @@ -10381,14 +10391,28 @@ SDValue TargetLowering::expandCMP(SDNode *Node, SelectionDAG &DAG) const { auto LTPredicate = (Opcode == ISD::UCMP ? ISD::SETULT : ISD::SETLT); auto GTPredicate = (Opcode == ISD::UCMP ? ISD::SETUGT : ISD::SETGT); - SDValue IsLT = DAG.getSetCC(dl, BoolVT, LHS, RHS, LTPredicate); SDValue IsGT = DAG.getSetCC(dl, BoolVT, LHS, RHS, GTPredicate); - SDValue SelectZeroOrOne = - DAG.getSelect(dl, ResVT, IsGT, DAG.getConstant(1, dl, ResVT), - DAG.getConstant(0, dl, ResVT)); - return DAG.getSelect(dl, ResVT, IsLT, DAG.getConstant(-1, dl, ResVT), - SelectZeroOrOne); + + // We can't perform arithmetic on i1 values. Extending them would + // probably result in worse codegen, so let's just use two selects instead. + // Some targets are also just better off using selects rather than subtraction + // because one of the conditions can be merged with one of the selects. + // And finally, if we don't know the contents of high bits of a boolean value + // we can't perform any arithmetic either. + if (shouldExpandCmpUsingSelects() || BoolVT.getScalarSizeInBits() == 1 || + getBooleanContents(BoolVT) == UndefinedBooleanContent) { + SDValue SelectZeroOrOne = + DAG.getSelect(dl, ResVT, IsGT, DAG.getConstant(1, dl, ResVT), + DAG.getConstant(0, dl, ResVT)); + return DAG.getSelect(dl, ResVT, IsLT, DAG.getConstant(-1, dl, ResVT), + SelectZeroOrOne); + } + + if (getBooleanContents(BoolVT) == ZeroOrNegativeOneBooleanContent) + std::swap(IsGT, IsLT); + return DAG.getSExtOrTrunc(DAG.getNode(ISD::SUB, dl, BoolVT, IsGT, IsLT), dl, + ResVT); } SDValue TargetLowering::expandShlSat(SDNode *Node, SelectionDAG &DAG) const { diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index eccac0e218c58e..bf031c00a24491 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -28,7 +28,7 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetOpcodes.h" @@ -98,350 +98,6 @@ static cl::opt DisableStrictNodeMutation("disable-strictnode-mutation", cl::desc("Don't mutate strict-float node to a legalize node"), cl::init(false), cl::Hidden); -static bool darwinHasSinCos(const Triple &TT) { - assert(TT.isOSDarwin() && "should be called with darwin triple"); - // Don't bother with 32 bit x86. - if (TT.getArch() == Triple::x86) - return false; - // Macos < 10.9 has no sincos_stret. - if (TT.isMacOSX()) - return !TT.isMacOSXVersionLT(10, 9) && TT.isArch64Bit(); - // iOS < 7.0 has no sincos_stret. - if (TT.isiOS()) - return !TT.isOSVersionLT(7, 0); - // Any other darwin such as WatchOS/TvOS is new enough. - return true; -} - -void TargetLoweringBase::InitLibcalls(const Triple &TT) { -#define HANDLE_LIBCALL(code, name) \ - setLibcallName(RTLIB::code, name); -#include "llvm/IR/RuntimeLibcalls.def" -#undef HANDLE_LIBCALL - // Initialize calling conventions to their default. - for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC) - setLibcallCallingConv((RTLIB::Libcall)LC, CallingConv::C); - - // Use the f128 variants of math functions on x86_64 - if (TT.getArch() == Triple::ArchType::x86_64 && TT.isGNUEnvironment()) { - setLibcallName(RTLIB::REM_F128, "fmodf128"); - setLibcallName(RTLIB::FMA_F128, "fmaf128"); - setLibcallName(RTLIB::SQRT_F128, "sqrtf128"); - setLibcallName(RTLIB::CBRT_F128, "cbrtf128"); - setLibcallName(RTLIB::LOG_F128, "logf128"); - setLibcallName(RTLIB::LOG_FINITE_F128, "__logf128_finite"); - setLibcallName(RTLIB::LOG2_F128, "log2f128"); - setLibcallName(RTLIB::LOG2_FINITE_F128, "__log2f128_finite"); - setLibcallName(RTLIB::LOG10_F128, "log10f128"); - setLibcallName(RTLIB::LOG10_FINITE_F128, "__log10f128_finite"); - setLibcallName(RTLIB::EXP_F128, "expf128"); - setLibcallName(RTLIB::EXP_FINITE_F128, "__expf128_finite"); - setLibcallName(RTLIB::EXP2_F128, "exp2f128"); - setLibcallName(RTLIB::EXP2_FINITE_F128, "__exp2f128_finite"); - setLibcallName(RTLIB::EXP10_F128, "exp10f128"); - setLibcallName(RTLIB::SIN_F128, "sinf128"); - setLibcallName(RTLIB::COS_F128, "cosf128"); - setLibcallName(RTLIB::TAN_F128, "tanf128"); - setLibcallName(RTLIB::SINCOS_F128, "sincosf128"); - setLibcallName(RTLIB::ASIN_F128, "asinf128"); - setLibcallName(RTLIB::ACOS_F128, "acosf128"); - setLibcallName(RTLIB::ATAN_F128, "atanf128"); - setLibcallName(RTLIB::SINH_F128, "sinhf128"); - setLibcallName(RTLIB::COSH_F128, "coshf128"); - setLibcallName(RTLIB::TANH_F128, "tanhf128"); - setLibcallName(RTLIB::POW_F128, "powf128"); - setLibcallName(RTLIB::POW_FINITE_F128, "__powf128_finite"); - setLibcallName(RTLIB::CEIL_F128, "ceilf128"); - setLibcallName(RTLIB::TRUNC_F128, "truncf128"); - setLibcallName(RTLIB::RINT_F128, "rintf128"); - setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128"); - setLibcallName(RTLIB::ROUND_F128, "roundf128"); - setLibcallName(RTLIB::ROUNDEVEN_F128, "roundevenf128"); - setLibcallName(RTLIB::FLOOR_F128, "floorf128"); - setLibcallName(RTLIB::COPYSIGN_F128, "copysignf128"); - setLibcallName(RTLIB::FMIN_F128, "fminf128"); - setLibcallName(RTLIB::FMAX_F128, "fmaxf128"); - setLibcallName(RTLIB::LROUND_F128, "lroundf128"); - setLibcallName(RTLIB::LLROUND_F128, "llroundf128"); - setLibcallName(RTLIB::LRINT_F128, "lrintf128"); - setLibcallName(RTLIB::LLRINT_F128, "llrintf128"); - setLibcallName(RTLIB::LDEXP_F128, "ldexpf128"); - setLibcallName(RTLIB::FREXP_F128, "frexpf128"); - } - - // For IEEE quad-precision libcall names, PPC uses "kf" instead of "tf". - if (TT.isPPC()) { - setLibcallName(RTLIB::ADD_F128, "__addkf3"); - setLibcallName(RTLIB::SUB_F128, "__subkf3"); - setLibcallName(RTLIB::MUL_F128, "__mulkf3"); - setLibcallName(RTLIB::DIV_F128, "__divkf3"); - setLibcallName(RTLIB::POWI_F128, "__powikf2"); - setLibcallName(RTLIB::FPEXT_F32_F128, "__extendsfkf2"); - setLibcallName(RTLIB::FPEXT_F64_F128, "__extenddfkf2"); - setLibcallName(RTLIB::FPROUND_F128_F32, "__trunckfsf2"); - setLibcallName(RTLIB::FPROUND_F128_F64, "__trunckfdf2"); - setLibcallName(RTLIB::FPTOSINT_F128_I32, "__fixkfsi"); - setLibcallName(RTLIB::FPTOSINT_F128_I64, "__fixkfdi"); - setLibcallName(RTLIB::FPTOSINT_F128_I128, "__fixkfti"); - setLibcallName(RTLIB::FPTOUINT_F128_I32, "__fixunskfsi"); - setLibcallName(RTLIB::FPTOUINT_F128_I64, "__fixunskfdi"); - setLibcallName(RTLIB::FPTOUINT_F128_I128, "__fixunskfti"); - setLibcallName(RTLIB::SINTTOFP_I32_F128, "__floatsikf"); - setLibcallName(RTLIB::SINTTOFP_I64_F128, "__floatdikf"); - setLibcallName(RTLIB::SINTTOFP_I128_F128, "__floattikf"); - setLibcallName(RTLIB::UINTTOFP_I32_F128, "__floatunsikf"); - setLibcallName(RTLIB::UINTTOFP_I64_F128, "__floatundikf"); - setLibcallName(RTLIB::UINTTOFP_I128_F128, "__floatuntikf"); - setLibcallName(RTLIB::OEQ_F128, "__eqkf2"); - setLibcallName(RTLIB::UNE_F128, "__nekf2"); - setLibcallName(RTLIB::OGE_F128, "__gekf2"); - setLibcallName(RTLIB::OLT_F128, "__ltkf2"); - setLibcallName(RTLIB::OLE_F128, "__lekf2"); - setLibcallName(RTLIB::OGT_F128, "__gtkf2"); - setLibcallName(RTLIB::UO_F128, "__unordkf2"); - } - - // A few names are different on particular architectures or environments. - if (TT.isOSDarwin()) { - // For f16/f32 conversions, Darwin uses the standard naming scheme, instead - // of the gnueabi-style __gnu_*_ieee. - // FIXME: What about other targets? - setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); - setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); - - // Some darwins have an optimized __bzero/bzero function. - switch (TT.getArch()) { - case Triple::x86: - case Triple::x86_64: - if (TT.isMacOSX() && !TT.isMacOSXVersionLT(10, 6)) - setLibcallName(RTLIB::BZERO, "__bzero"); - break; - case Triple::aarch64: - case Triple::aarch64_32: - setLibcallName(RTLIB::BZERO, "bzero"); - break; - default: - break; - } - - if (darwinHasSinCos(TT)) { - setLibcallName(RTLIB::SINCOS_STRET_F32, "__sincosf_stret"); - setLibcallName(RTLIB::SINCOS_STRET_F64, "__sincos_stret"); - if (TT.isWatchABI()) { - setLibcallCallingConv(RTLIB::SINCOS_STRET_F32, - CallingConv::ARM_AAPCS_VFP); - setLibcallCallingConv(RTLIB::SINCOS_STRET_F64, - CallingConv::ARM_AAPCS_VFP); - } - } - - switch (TT.getOS()) { - case Triple::MacOSX: - if (TT.isMacOSXVersionLT(10, 9)) { - setLibcallName(RTLIB::EXP10_F32, nullptr); - setLibcallName(RTLIB::EXP10_F64, nullptr); - } else { - setLibcallName(RTLIB::EXP10_F32, "__exp10f"); - setLibcallName(RTLIB::EXP10_F64, "__exp10"); - } - break; - case Triple::IOS: - if (TT.isOSVersionLT(7, 0)) { - setLibcallName(RTLIB::EXP10_F32, nullptr); - setLibcallName(RTLIB::EXP10_F64, nullptr); - break; - } - [[fallthrough]]; - case Triple::TvOS: - case Triple::WatchOS: - case Triple::XROS: - setLibcallName(RTLIB::EXP10_F32, "__exp10f"); - setLibcallName(RTLIB::EXP10_F64, "__exp10"); - break; - default: - break; - } - } else { - setLibcallName(RTLIB::FPEXT_F16_F32, "__gnu_h2f_ieee"); - setLibcallName(RTLIB::FPROUND_F32_F16, "__gnu_f2h_ieee"); - } - - if (TT.isGNUEnvironment() || TT.isOSFuchsia() || - (TT.isAndroid() && !TT.isAndroidVersionLT(9))) { - setLibcallName(RTLIB::SINCOS_F32, "sincosf"); - setLibcallName(RTLIB::SINCOS_F64, "sincos"); - setLibcallName(RTLIB::SINCOS_F80, "sincosl"); - setLibcallName(RTLIB::SINCOS_F128, "sincosl"); - setLibcallName(RTLIB::SINCOS_PPCF128, "sincosl"); - } - - if (TT.isPS()) { - setLibcallName(RTLIB::SINCOS_F32, "sincosf"); - setLibcallName(RTLIB::SINCOS_F64, "sincos"); - } - - if (TT.isOSOpenBSD()) { - setLibcallName(RTLIB::STACKPROTECTOR_CHECK_FAIL, nullptr); - } - - if (TT.isOSWindows() && !TT.isOSCygMing()) { - setLibcallName(RTLIB::LDEXP_F32, nullptr); - setLibcallName(RTLIB::LDEXP_F80, nullptr); - setLibcallName(RTLIB::LDEXP_F128, nullptr); - setLibcallName(RTLIB::LDEXP_PPCF128, nullptr); - - setLibcallName(RTLIB::FREXP_F32, nullptr); - setLibcallName(RTLIB::FREXP_F80, nullptr); - setLibcallName(RTLIB::FREXP_F128, nullptr); - setLibcallName(RTLIB::FREXP_PPCF128, nullptr); - } - - if (TT.isAArch64()) { - if (TT.isOSMSVCRT()) { - // MSVCRT doesn't have powi; fall back to pow - setLibcallName(RTLIB::POWI_F32, nullptr); - setLibcallName(RTLIB::POWI_F64, nullptr); - } - } - - // Disable most libcalls on AMDGPU. - if (TT.isAMDGPU()) { - for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) { - if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16) - setLibcallName(static_cast(I), nullptr); - } - } - - // Disable most libcalls on NVPTX. - if (TT.isNVPTX()) { - for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) - if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16) - setLibcallName(static_cast(I), nullptr); - } - - if (TT.isARM() || TT.isThumb()) { - // These libcalls are not available in 32-bit. - setLibcallName(RTLIB::SHL_I128, nullptr); - setLibcallName(RTLIB::SRL_I128, nullptr); - setLibcallName(RTLIB::SRA_I128, nullptr); - setLibcallName(RTLIB::MUL_I128, nullptr); - setLibcallName(RTLIB::MULO_I64, nullptr); - setLibcallName(RTLIB::MULO_I128, nullptr); - - if (TT.isOSMSVCRT()) { - // MSVCRT doesn't have powi; fall back to pow - setLibcallName(RTLIB::POWI_F32, nullptr); - setLibcallName(RTLIB::POWI_F64, nullptr); - } - } - - if (TT.getArch() == Triple::ArchType::avr) { - // Division rtlib functions (not supported), use divmod functions instead - setLibcallName(RTLIB::SDIV_I8, nullptr); - setLibcallName(RTLIB::SDIV_I16, nullptr); - setLibcallName(RTLIB::SDIV_I32, nullptr); - setLibcallName(RTLIB::UDIV_I8, nullptr); - setLibcallName(RTLIB::UDIV_I16, nullptr); - setLibcallName(RTLIB::UDIV_I32, nullptr); - - // Modulus rtlib functions (not supported), use divmod functions instead - setLibcallName(RTLIB::SREM_I8, nullptr); - setLibcallName(RTLIB::SREM_I16, nullptr); - setLibcallName(RTLIB::SREM_I32, nullptr); - setLibcallName(RTLIB::UREM_I8, nullptr); - setLibcallName(RTLIB::UREM_I16, nullptr); - setLibcallName(RTLIB::UREM_I32, nullptr); - } - - if (TT.getArch() == Triple::ArchType::hexagon) { - // These cause problems when the shift amount is non-constant. - setLibcallName(RTLIB::SHL_I128, nullptr); - setLibcallName(RTLIB::SRL_I128, nullptr); - setLibcallName(RTLIB::SRA_I128, nullptr); - } - - if (TT.isLoongArch()) { - if (!TT.isLoongArch64()) { - // Set libcalls. - setLibcallName(RTLIB::MUL_I128, nullptr); - // The MULO libcall is not part of libgcc, only compiler-rt. - setLibcallName(RTLIB::MULO_I64, nullptr); - } - // The MULO libcall is not part of libgcc, only compiler-rt. - setLibcallName(RTLIB::MULO_I128, nullptr); - } - - if (TT.isMIPS32()) { - // These libcalls are not available in 32-bit. - setLibcallName(RTLIB::SHL_I128, nullptr); - setLibcallName(RTLIB::SRL_I128, nullptr); - setLibcallName(RTLIB::SRA_I128, nullptr); - setLibcallName(RTLIB::MUL_I128, nullptr); - setLibcallName(RTLIB::MULO_I64, nullptr); - setLibcallName(RTLIB::MULO_I128, nullptr); - } - - if (TT.isPPC()) { - if (!TT.isPPC64()) { - // These libcalls are not available in 32-bit. - setLibcallName(RTLIB::SHL_I128, nullptr); - setLibcallName(RTLIB::SRL_I128, nullptr); - setLibcallName(RTLIB::SRA_I128, nullptr); - setLibcallName(RTLIB::MUL_I128, nullptr); - setLibcallName(RTLIB::MULO_I64, nullptr); - } - setLibcallName(RTLIB::MULO_I128, nullptr); - } - - if (TT.isRISCV32()) { - // These libcalls are not available in 32-bit. - setLibcallName(RTLIB::SHL_I128, nullptr); - setLibcallName(RTLIB::SRL_I128, nullptr); - setLibcallName(RTLIB::SRA_I128, nullptr); - setLibcallName(RTLIB::MUL_I128, nullptr); - setLibcallName(RTLIB::MULO_I64, nullptr); - } - - if (TT.isSPARC()) { - if (!TT.isSPARC64()) { - // These libcalls are not available in 32-bit. - setLibcallName(RTLIB::MULO_I64, nullptr); - setLibcallName(RTLIB::MUL_I128, nullptr); - setLibcallName(RTLIB::SHL_I128, nullptr); - setLibcallName(RTLIB::SRL_I128, nullptr); - setLibcallName(RTLIB::SRA_I128, nullptr); - } - setLibcallName(RTLIB::MULO_I128, nullptr); - } - - if (TT.isSystemZ()) { - setLibcallName(RTLIB::SRL_I128, nullptr); - setLibcallName(RTLIB::SHL_I128, nullptr); - setLibcallName(RTLIB::SRA_I128, nullptr); - } - - if (TT.isX86()) { - if (TT.getArch() == Triple::ArchType::x86) { - // These libcalls are not available in 32-bit. - setLibcallName(RTLIB::SHL_I128, nullptr); - setLibcallName(RTLIB::SRL_I128, nullptr); - setLibcallName(RTLIB::SRA_I128, nullptr); - setLibcallName(RTLIB::MUL_I128, nullptr); - // The MULO libcall is not part of libgcc, only compiler-rt. - setLibcallName(RTLIB::MULO_I64, nullptr); - } - - // The MULO libcall is not part of libgcc, only compiler-rt. - setLibcallName(RTLIB::MULO_I128, nullptr); - - if (TT.isOSMSVCRT()) { - // MSVCRT doesn't have powi; fall back to pow - setLibcallName(RTLIB::POWI_F32, nullptr); - setLibcallName(RTLIB::POWI_F64, nullptr); - } - } -} - /// GetFPLibCall - Helper to return the right libcall for the given floating /// point type, or UNKNOWN_LIBCALL if there is none. RTLIB::Libcall RTLIB::getFPLibCall(EVT VT, @@ -918,41 +574,9 @@ RTLIB::Libcall RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) { } } -/// InitCmpLibcallCCs - Set default comparison libcall CC. -static void InitCmpLibcallCCs(ISD::CondCode *CCs) { - std::fill(CCs, CCs + RTLIB::UNKNOWN_LIBCALL, ISD::SETCC_INVALID); - CCs[RTLIB::OEQ_F32] = ISD::SETEQ; - CCs[RTLIB::OEQ_F64] = ISD::SETEQ; - CCs[RTLIB::OEQ_F128] = ISD::SETEQ; - CCs[RTLIB::OEQ_PPCF128] = ISD::SETEQ; - CCs[RTLIB::UNE_F32] = ISD::SETNE; - CCs[RTLIB::UNE_F64] = ISD::SETNE; - CCs[RTLIB::UNE_F128] = ISD::SETNE; - CCs[RTLIB::UNE_PPCF128] = ISD::SETNE; - CCs[RTLIB::OGE_F32] = ISD::SETGE; - CCs[RTLIB::OGE_F64] = ISD::SETGE; - CCs[RTLIB::OGE_F128] = ISD::SETGE; - CCs[RTLIB::OGE_PPCF128] = ISD::SETGE; - CCs[RTLIB::OLT_F32] = ISD::SETLT; - CCs[RTLIB::OLT_F64] = ISD::SETLT; - CCs[RTLIB::OLT_F128] = ISD::SETLT; - CCs[RTLIB::OLT_PPCF128] = ISD::SETLT; - CCs[RTLIB::OLE_F32] = ISD::SETLE; - CCs[RTLIB::OLE_F64] = ISD::SETLE; - CCs[RTLIB::OLE_F128] = ISD::SETLE; - CCs[RTLIB::OLE_PPCF128] = ISD::SETLE; - CCs[RTLIB::OGT_F32] = ISD::SETGT; - CCs[RTLIB::OGT_F64] = ISD::SETGT; - CCs[RTLIB::OGT_F128] = ISD::SETGT; - CCs[RTLIB::OGT_PPCF128] = ISD::SETGT; - CCs[RTLIB::UO_F32] = ISD::SETNE; - CCs[RTLIB::UO_F64] = ISD::SETNE; - CCs[RTLIB::UO_F128] = ISD::SETNE; - CCs[RTLIB::UO_PPCF128] = ISD::SETNE; -} - /// NOTE: The TargetMachine owns TLOF. -TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { +TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) + : TM(tm), Libcalls(TM.getTargetTriple()) { initActions(); // Perform these initializations only once. @@ -984,11 +608,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { MinCmpXchgSizeInBits = 0; SupportsUnalignedAtomics = false; - - std::fill(std::begin(LibcallRoutineNames), std::end(LibcallRoutineNames), nullptr); - - InitLibcalls(TM.getTargetTriple()); - InitCmpLibcallCCs(CmpLibcallCCs); } void TargetLoweringBase::initActions() { diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index 73385fee019b02..665d57841a97b6 100644 --- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -26,6 +26,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/TwoAddressInstructionPass.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -36,10 +37,12 @@ #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" @@ -86,7 +89,7 @@ static cl::opt MaxDataFlowEdge( namespace { -class TwoAddressInstructionPass : public MachineFunctionPass { +class TwoAddressInstructionImpl { MachineFunction *MF = nullptr; const TargetInstrInfo *TII = nullptr; const TargetRegisterInfo *TRI = nullptr; @@ -185,11 +188,31 @@ class TwoAddressInstructionPass : public MachineFunctionPass { void eliminateRegSequence(MachineBasicBlock::iterator&); bool processStatepoint(MachineInstr *MI, TiedOperandMap &TiedOperands); +public: + TwoAddressInstructionImpl(MachineFunction &MF, MachineFunctionPass *P); + TwoAddressInstructionImpl(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); + void setOptLevel(CodeGenOptLevel Level) { OptLevel = Level; } + bool run(); +}; + +class TwoAddressInstructionLegacyPass : public MachineFunctionPass { public: static char ID; // Pass identification, replacement for typeid - TwoAddressInstructionPass() : MachineFunctionPass(ID) { - initializeTwoAddressInstructionPassPass(*PassRegistry::getPassRegistry()); + TwoAddressInstructionLegacyPass() : MachineFunctionPass(ID) { + initializeTwoAddressInstructionLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + /// Pass entry point. + bool runOnMachineFunction(MachineFunction &MF) override { + TwoAddressInstructionImpl Impl(MF, this); + // Disable optimizations if requested. We cannot skip the whole pass as some + // fixups are necessary for correctness. + if (skipFunction(MF.getFunction())) + Impl.setOptLevel(CodeGenOptLevel::None); + return Impl.run(); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -203,26 +226,76 @@ class TwoAddressInstructionPass : public MachineFunctionPass { AU.addPreservedID(MachineDominatorsID); MachineFunctionPass::getAnalysisUsage(AU); } - - /// Pass entry point. - bool runOnMachineFunction(MachineFunction&) override; }; } // end anonymous namespace -char TwoAddressInstructionPass::ID = 0; +PreservedAnalyses +TwoAddressInstructionPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + // Disable optimizations if requested. We cannot skip the whole pass as some + // fixups are necessary for correctness. + TwoAddressInstructionImpl Impl(MF, MFAM); + if (MF.getFunction().hasOptNone()) + Impl.setOptLevel(CodeGenOptLevel::None); + + MFPropsModifier _(*this, MF); + bool Changed = Impl.run(); + if (!Changed) + return PreservedAnalyses::all(); + auto PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserve(); + PA.preserve(); + PA.preserve(); + PA.preserve(); + PA.preserve(); + PA.preserveSet(); + return PA; +} + +char TwoAddressInstructionLegacyPass::ID = 0; -char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionPass::ID; +char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionLegacyPass::ID; -INITIALIZE_PASS_BEGIN(TwoAddressInstructionPass, DEBUG_TYPE, - "Two-Address instruction pass", false, false) +INITIALIZE_PASS_BEGIN(TwoAddressInstructionLegacyPass, DEBUG_TYPE, + "Two-Address instruction pass", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(TwoAddressInstructionPass, DEBUG_TYPE, - "Two-Address instruction pass", false, false) +INITIALIZE_PASS_END(TwoAddressInstructionLegacyPass, DEBUG_TYPE, + "Two-Address instruction pass", false, false) + +TwoAddressInstructionImpl::TwoAddressInstructionImpl( + MachineFunction &Func, MachineFunctionAnalysisManager &MFAM) + : MF(&Func), TII(Func.getSubtarget().getInstrInfo()), + TRI(Func.getSubtarget().getRegisterInfo()), + InstrItins(Func.getSubtarget().getInstrItineraryData()), + MRI(&Func.getRegInfo()), + LV(MFAM.getCachedResult(Func)), + LIS(MFAM.getCachedResult(Func)), + OptLevel(Func.getTarget().getOptLevel()) { + auto &FAM = MFAM.getResult(Func) + .getManager(); + AA = FAM.getCachedResult(Func.getFunction()); +} + +TwoAddressInstructionImpl::TwoAddressInstructionImpl(MachineFunction &Func, + MachineFunctionPass *P) + : MF(&Func), TII(Func.getSubtarget().getInstrInfo()), + TRI(Func.getSubtarget().getRegisterInfo()), + InstrItins(Func.getSubtarget().getInstrItineraryData()), + MRI(&Func.getRegInfo()), OptLevel(Func.getTarget().getOptLevel()) { + auto *LVWrapper = P->getAnalysisIfAvailable(); + LV = LVWrapper ? &LVWrapper->getLV() : nullptr; + auto *LISWrapper = P->getAnalysisIfAvailable(); + LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr; + if (auto *AAPass = P->getAnalysisIfAvailable()) + AA = &AAPass->getAAResults(); + else + AA = nullptr; +} /// Return the MachineInstr* if it is the single def of the Reg in current BB. MachineInstr * -TwoAddressInstructionPass::getSingleDef(Register Reg, +TwoAddressInstructionImpl::getSingleDef(Register Reg, MachineBasicBlock *BB) const { MachineInstr *Ret = nullptr; for (MachineInstr &DefMI : MRI->def_instructions(Reg)) { @@ -243,7 +316,7 @@ TwoAddressInstructionPass::getSingleDef(Register Reg, /// %Tmp2 = copy %ToReg; /// MaxLen specifies the maximum length of the copy chain the func /// can walk through. -bool TwoAddressInstructionPass::isRevCopyChain(Register FromReg, Register ToReg, +bool TwoAddressInstructionImpl::isRevCopyChain(Register FromReg, Register ToReg, int Maxlen) { Register TmpReg = FromReg; for (int i = 0; i < Maxlen; i++) { @@ -263,7 +336,7 @@ bool TwoAddressInstructionPass::isRevCopyChain(Register FromReg, Register ToReg, /// in the MBB that defines the specified register and the two-address /// instruction which is being processed. It also returns the last def location /// by reference. -bool TwoAddressInstructionPass::noUseAfterLastDef(Register Reg, unsigned Dist, +bool TwoAddressInstructionImpl::noUseAfterLastDef(Register Reg, unsigned Dist, unsigned &LastDef) { LastDef = 0; unsigned LastUse = Dist; @@ -286,7 +359,7 @@ bool TwoAddressInstructionPass::noUseAfterLastDef(Register Reg, unsigned Dist, /// Return true if the specified MI is a copy instruction or an extract_subreg /// instruction. It also returns the source and destination registers and /// whether they are physical registers by reference. -bool TwoAddressInstructionPass::isCopyToReg(MachineInstr &MI, Register &SrcReg, +bool TwoAddressInstructionImpl::isCopyToReg(MachineInstr &MI, Register &SrcReg, Register &DstReg, bool &IsSrcPhys, bool &IsDstPhys) const { SrcReg = 0; @@ -306,7 +379,7 @@ bool TwoAddressInstructionPass::isCopyToReg(MachineInstr &MI, Register &SrcReg, return true; } -bool TwoAddressInstructionPass::isPlainlyKilled(const MachineInstr *MI, +bool TwoAddressInstructionImpl::isPlainlyKilled(const MachineInstr *MI, LiveRange &LR) const { // This is to match the kill flag version where undefs don't have kill flags. if (!LR.hasAtLeastOneValue()) @@ -320,7 +393,7 @@ bool TwoAddressInstructionPass::isPlainlyKilled(const MachineInstr *MI, /// Test if the given register value, which is used by the /// given instruction, is killed by the given instruction. -bool TwoAddressInstructionPass::isPlainlyKilled(const MachineInstr *MI, +bool TwoAddressInstructionImpl::isPlainlyKilled(const MachineInstr *MI, Register Reg) const { // FIXME: Sometimes tryInstructionTransform() will add instructions and // test whether they can be folded before keeping them. In this case it @@ -344,7 +417,7 @@ bool TwoAddressInstructionPass::isPlainlyKilled(const MachineInstr *MI, /// Test if the register used by the given operand is killed by the operand's /// instruction. -bool TwoAddressInstructionPass::isPlainlyKilled( +bool TwoAddressInstructionImpl::isPlainlyKilled( const MachineOperand &MO) const { return MO.isKill() || isPlainlyKilled(MO.getParent(), MO.getReg()); } @@ -366,7 +439,7 @@ bool TwoAddressInstructionPass::isPlainlyKilled( /// /// If allowFalsePositives is true then likely kills are treated as kills even /// if it can't be proven that they are kills. -bool TwoAddressInstructionPass::isKilled(MachineInstr &MI, Register Reg, +bool TwoAddressInstructionImpl::isKilled(MachineInstr &MI, Register Reg, bool allowFalsePositives) const { MachineInstr *DefMI = &MI; while (true) { @@ -411,7 +484,7 @@ static bool isTwoAddrUse(MachineInstr &MI, Register Reg, Register &DstReg) { /// Given a register, if all its uses are in the same basic block, return the /// last use instruction if it's a copy or a two-address use. -MachineInstr *TwoAddressInstructionPass::findOnlyInterestingUse( +MachineInstr *TwoAddressInstructionImpl::findOnlyInterestingUse( Register Reg, MachineBasicBlock *MBB, bool &IsCopy, Register &DstReg, bool &IsDstPhys) const { MachineOperand *UseOp = nullptr; @@ -468,7 +541,7 @@ static MCRegister getMappedReg(Register Reg, } /// Return true if the two registers are equal or aliased. -bool TwoAddressInstructionPass::regsAreCompatible(Register RegA, +bool TwoAddressInstructionImpl::regsAreCompatible(Register RegA, Register RegB) const { if (RegA == RegB) return true; @@ -478,7 +551,7 @@ bool TwoAddressInstructionPass::regsAreCompatible(Register RegA, } /// From RegMap remove entries mapped to a physical register which overlaps MO. -void TwoAddressInstructionPass::removeMapRegEntry( +void TwoAddressInstructionImpl::removeMapRegEntry( const MachineOperand &MO, DenseMap &RegMap) const { assert( (MO.isReg() || MO.isRegMask()) && @@ -510,7 +583,7 @@ void TwoAddressInstructionPass::removeMapRegEntry( /// /// After the MUL instruction, $rdx contains different value than in the COPY /// instruction. So %2 should not map to $rdx after MUL. -void TwoAddressInstructionPass::removeClobberedSrcRegMap(MachineInstr *MI) { +void TwoAddressInstructionImpl::removeClobberedSrcRegMap(MachineInstr *MI) { if (MI->isCopy()) { // If a virtual register is copied to its mapped physical register, it // doesn't change the potential coalescing between them, so we don't remove @@ -546,7 +619,7 @@ void TwoAddressInstructionPass::removeClobberedSrcRegMap(MachineInstr *MI) { } // Returns true if Reg is equal or aliased to at least one register in Set. -bool TwoAddressInstructionPass::regOverlapsSet( +bool TwoAddressInstructionImpl::regOverlapsSet( const SmallVectorImpl &Set, Register Reg) const { for (unsigned R : Set) if (TRI->regsOverlap(R, Reg)) @@ -557,7 +630,7 @@ bool TwoAddressInstructionPass::regOverlapsSet( /// Return true if it's potentially profitable to commute the two-address /// instruction that's being processed. -bool TwoAddressInstructionPass::isProfitableToCommute(Register RegA, +bool TwoAddressInstructionImpl::isProfitableToCommute(Register RegA, Register RegB, Register RegC, MachineInstr *MI, @@ -662,7 +735,7 @@ bool TwoAddressInstructionPass::isProfitableToCommute(Register RegA, /// Commute a two-address instruction and update the basic block, distance map, /// and live variables if needed. Return true if it is successful. -bool TwoAddressInstructionPass::commuteInstruction(MachineInstr *MI, +bool TwoAddressInstructionImpl::commuteInstruction(MachineInstr *MI, unsigned DstIdx, unsigned RegBIdx, unsigned RegCIdx, @@ -693,7 +766,7 @@ bool TwoAddressInstructionPass::commuteInstruction(MachineInstr *MI, /// Return true if it is profitable to convert the given 2-address instruction /// to a 3-address one. -bool TwoAddressInstructionPass::isProfitableToConv3Addr(Register RegA, +bool TwoAddressInstructionImpl::isProfitableToConv3Addr(Register RegA, Register RegB) { // Look for situations like this: // %reg1024 = MOV r1 @@ -710,7 +783,7 @@ bool TwoAddressInstructionPass::isProfitableToConv3Addr(Register RegA, /// Convert the specified two-address instruction into a three address one. /// Return true if this transformation was successful. -bool TwoAddressInstructionPass::convertInstTo3Addr( +bool TwoAddressInstructionImpl::convertInstTo3Addr( MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, Register RegA, Register RegB, unsigned &Dist) { MachineInstrSpan MIS(mi, MBB); @@ -752,7 +825,7 @@ bool TwoAddressInstructionPass::convertInstTo3Addr( /// Scan forward recursively for only uses, update maps if the use is a copy or /// a two-address instruction. -void TwoAddressInstructionPass::scanUses(Register DstReg) { +void TwoAddressInstructionImpl::scanUses(Register DstReg) { SmallVector VirtRegPairs; bool IsDstPhys; bool IsCopy = false; @@ -805,7 +878,7 @@ void TwoAddressInstructionPass::scanUses(Register DstReg) { /// coalesced to r0 (from the input side). v1025 is mapped to r1. v1026 is /// potentially joined with r1 on the output side. It's worthwhile to commute /// 'add' to eliminate a copy. -void TwoAddressInstructionPass::processCopy(MachineInstr *MI) { +void TwoAddressInstructionImpl::processCopy(MachineInstr *MI) { if (Processed.count(MI)) return; @@ -831,7 +904,7 @@ void TwoAddressInstructionPass::processCopy(MachineInstr *MI) { /// If there is one more local instruction that reads 'Reg' and it kills 'Reg, /// consider moving the instruction below the kill instruction in order to /// eliminate the need for the copy. -bool TwoAddressInstructionPass::rescheduleMIBelowKill( +bool TwoAddressInstructionImpl::rescheduleMIBelowKill( MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, Register Reg) { // Bail immediately if we don't have LV or LIS available. We use them to find @@ -998,7 +1071,7 @@ bool TwoAddressInstructionPass::rescheduleMIBelowKill( /// Return true if the re-scheduling will put the given instruction too close /// to the defs of its register dependencies. -bool TwoAddressInstructionPass::isDefTooClose(Register Reg, unsigned Dist, +bool TwoAddressInstructionImpl::isDefTooClose(Register Reg, unsigned Dist, MachineInstr *MI) { for (MachineInstr &DefMI : MRI->def_instructions(Reg)) { if (DefMI.getParent() != MBB || DefMI.isCopy() || DefMI.isCopyLike()) @@ -1019,7 +1092,7 @@ bool TwoAddressInstructionPass::isDefTooClose(Register Reg, unsigned Dist, /// If there is one more local instruction that reads 'Reg' and it kills 'Reg, /// consider moving the kill instruction above the current two-address /// instruction in order to eliminate the need for the copy. -bool TwoAddressInstructionPass::rescheduleKillAboveMI( +bool TwoAddressInstructionImpl::rescheduleKillAboveMI( MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, Register Reg) { // Bail immediately if we don't have LV or LIS available. We use them to find @@ -1171,7 +1244,7 @@ bool TwoAddressInstructionPass::rescheduleKillAboveMI( /// to commute operands in the instruction. /// /// Returns true if the transformation happened. Otherwise, returns false. -bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI, +bool TwoAddressInstructionImpl::tryInstructionCommute(MachineInstr *MI, unsigned DstOpIdx, unsigned BaseOpIdx, bool BaseOpKilled, @@ -1236,11 +1309,9 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI, /// (either because they were untied, or because mi was rescheduled, and will /// be visited again later). If the shouldOnlyCommute flag is true, only /// instruction commutation is attempted. -bool TwoAddressInstructionPass:: -tryInstructionTransform(MachineBasicBlock::iterator &mi, - MachineBasicBlock::iterator &nmi, - unsigned SrcIdx, unsigned DstIdx, - unsigned &Dist, bool shouldOnlyCommute) { +bool TwoAddressInstructionImpl::tryInstructionTransform( + MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, + unsigned SrcIdx, unsigned DstIdx, unsigned &Dist, bool shouldOnlyCommute) { if (OptLevel == CodeGenOptLevel::None) return false; @@ -1440,8 +1511,8 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi, // Collect tied operands of MI that need to be handled. // Rewrite trivial cases immediately. // Return true if any tied operands where found, including the trivial ones. -bool TwoAddressInstructionPass:: -collectTiedOperands(MachineInstr *MI, TiedOperandMap &TiedOperands) { +bool TwoAddressInstructionImpl::collectTiedOperands( + MachineInstr *MI, TiedOperandMap &TiedOperands) { bool AnyOps = false; unsigned NumOps = MI->getNumOperands(); @@ -1479,10 +1550,9 @@ collectTiedOperands(MachineInstr *MI, TiedOperandMap &TiedOperands) { // Process a list of tied MI operands that all use the same source register. // The tied pairs are of the form (SrcIdx, DstIdx). -void -TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, - TiedPairList &TiedPairs, - unsigned &Dist) { +void TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI, + TiedPairList &TiedPairs, + unsigned &Dist) { bool IsEarlyClobber = llvm::any_of(TiedPairs, [MI](auto const &TP) { return MI->getOperand(TP.second).isEarlyClobber(); }); @@ -1668,7 +1738,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, // and replaces all uses of RegA with RegB. // No extra COPY instruction is necessary because tied use is killed at // STATEPOINT. -bool TwoAddressInstructionPass::processStatepoint( +bool TwoAddressInstructionImpl::processStatepoint( MachineInstr *MI, TiedOperandMap &TiedOperands) { bool NeedCopy = false; @@ -1755,27 +1825,7 @@ bool TwoAddressInstructionPass::processStatepoint( } /// Reduce two-address instructions to two operands. -bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { - MF = &Func; - const TargetMachine &TM = MF->getTarget(); - MRI = &MF->getRegInfo(); - TII = MF->getSubtarget().getInstrInfo(); - TRI = MF->getSubtarget().getRegisterInfo(); - InstrItins = MF->getSubtarget().getInstrItineraryData(); - auto *LVWrapper = getAnalysisIfAvailable(); - LV = LVWrapper ? &LVWrapper->getLV() : nullptr; - auto *LISWrapper = getAnalysisIfAvailable(); - LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr; - if (auto *AAPass = getAnalysisIfAvailable()) - AA = &AAPass->getAAResults(); - else - AA = nullptr; - OptLevel = TM.getOptLevel(); - // Disable optimizations if requested. We cannot skip the whole pass as some - // fixups are necessary for correctness. - if (skipFunction(Func.getFunction())) - OptLevel = CodeGenOptLevel::None; - +bool TwoAddressInstructionImpl::run() { bool MadeChange = false; LLVM_DEBUG(dbgs() << "********** REWRITING TWO-ADDR INSTRS **********\n"); @@ -1930,8 +1980,8 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { /// /// undef %dst:ssub0 = COPY %v1 /// %dst:ssub1 = COPY %v2 -void TwoAddressInstructionPass:: -eliminateRegSequence(MachineBasicBlock::iterator &MBBI) { +void TwoAddressInstructionImpl::eliminateRegSequence( + MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; Register DstReg = MI.getOperand(0).getReg(); diff --git a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp index c6312c387744aa..7510326f2e1b34 100644 --- a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp @@ -77,7 +77,15 @@ DWARFDie DWARFLinker::resolveDIEReference(const DWARFFile &File, const DWARFDie &DIE, CompileUnit *&RefCU) { assert(RefValue.isFormClass(DWARFFormValue::FC_Reference)); - uint64_t RefOffset = *RefValue.getAsReference(); + uint64_t RefOffset; + if (std::optional Off = RefValue.getAsRelativeReference()) { + RefOffset = RefValue.getUnit()->getOffset() + *Off; + } else if (Off = RefValue.getAsDebugInfoReference(); Off) { + RefOffset = *Off; + } else { + reportWarning("Unsupported reference type", File, &DIE); + return DWARFDie(); + } if ((RefCU = getUnitForOffset(Units, RefOffset))) if (const auto RefDie = RefCU->getOrigUnit().getDIEForOffset(RefOffset)) { // In a file with broken references, an attribute might point to a NULL @@ -1073,7 +1081,13 @@ unsigned DWARFLinker::DIECloner::cloneDieReferenceAttribute( unsigned AttrSize, const DWARFFormValue &Val, const DWARFFile &File, CompileUnit &Unit) { const DWARFUnit &U = Unit.getOrigUnit(); - uint64_t Ref = *Val.getAsReference(); + uint64_t Ref; + if (std::optional Off = Val.getAsRelativeReference()) + Ref = Val.getUnit()->getOffset() + *Off; + else if (Off = Val.getAsDebugInfoReference(); Off) + Ref = *Off; + else + return 0; DIE *NewRefDie = nullptr; CompileUnit *RefUnit = nullptr; diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp index 6f659eb8576b79..4daf781a2b53fa 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp @@ -381,38 +381,36 @@ void CompileUnit::updateDieRefPatchesWithClonedOffsets() { std::optional CompileUnit::resolveDIEReference( const DWARFFormValue &RefValue, ResolveInterCUReferencesMode CanResolveInterCUReferences) { - if (std::optional Ref = - *RefValue.getAsRelativeReference()) { - if (Ref->Unit == OrigUnit) { - // Referenced DIE is in current compile unit. - if (std::optional RefDieIdx = - getDIEIndexForOffset(OrigUnit->getOffset() + Ref->Offset)) - return UnitEntryPairTy{this, OrigUnit->getDebugInfoEntry(*RefDieIdx)}; - } - uint64_t RefDIEOffset = - Ref->Unit ? Ref->Unit->getOffset() + Ref->Offset : Ref->Offset; - if (CompileUnit *RefCU = getUnitFromOffset(RefDIEOffset)) { - if (RefCU == this) { - // Referenced DIE is in current compile unit. - if (std::optional RefDieIdx = - getDIEIndexForOffset(RefDIEOffset)) - return UnitEntryPairTy{this, getDebugInfoEntry(*RefDieIdx)}; - } else if (CanResolveInterCUReferences) { - // Referenced DIE is in other compile unit. - - // Check whether DIEs are loaded for that compile unit. - enum Stage ReferredCUStage = RefCU->getStage(); - if (ReferredCUStage < Stage::Loaded || ReferredCUStage > Stage::Cloned) - return UnitEntryPairTy{RefCU, nullptr}; - - if (std::optional RefDieIdx = - RefCU->getDIEIndexForOffset(RefDIEOffset)) - return UnitEntryPairTy{RefCU, RefCU->getDebugInfoEntry(*RefDieIdx)}; - } else - return UnitEntryPairTy{RefCU, nullptr}; - } + CompileUnit *RefCU; + uint64_t RefDIEOffset; + if (std::optional Offset = RefValue.getAsRelativeReference()) { + RefCU = this; + RefDIEOffset = RefValue.getUnit()->getOffset() + *Offset; + } else if (Offset = RefValue.getAsDebugInfoReference(); Offset) { + RefCU = getUnitFromOffset(*Offset); + RefDIEOffset = *Offset; + } else { + return std::nullopt; } + if (RefCU == this) { + // Referenced DIE is in current compile unit. + if (std::optional RefDieIdx = getDIEIndexForOffset(RefDIEOffset)) + return UnitEntryPairTy{this, getDebugInfoEntry(*RefDieIdx)}; + } else if (RefCU && CanResolveInterCUReferences) { + // Referenced DIE is in other compile unit. + + // Check whether DIEs are loaded for that compile unit. + enum Stage ReferredCUStage = RefCU->getStage(); + if (ReferredCUStage < Stage::Loaded || ReferredCUStage > Stage::Cloned) + return UnitEntryPairTy{RefCU, nullptr}; + + if (std::optional RefDieIdx = + RefCU->getDIEIndexForOffset(RefDIEOffset)) + return UnitEntryPairTy{RefCU, RefCU->getDebugInfoEntry(*RefDieIdx)}; + } else { + return UnitEntryPairTy{RefCU, nullptr}; + } return std::nullopt; } diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index 410842a80b0151..72e7464b689716 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -313,13 +313,12 @@ DWARFDie::getAttributeValueAsReferencedDie(dwarf::Attribute Attr) const { DWARFDie DWARFDie::getAttributeValueAsReferencedDie(const DWARFFormValue &V) const { DWARFDie Result; - if (auto SpecRef = V.getAsRelativeReference()) { - if (SpecRef->Unit) - Result = SpecRef->Unit->getDIEForOffset(SpecRef->Unit->getOffset() + - SpecRef->Offset); - else if (auto SpecUnit = - U->getUnitVector().getUnitForOffset(SpecRef->Offset)) - Result = SpecUnit->getDIEForOffset(SpecRef->Offset); + if (std::optional Offset = V.getAsRelativeReference()) { + Result = const_cast(V.getUnit()) + ->getDIEForOffset(V.getUnit()->getOffset() + *Offset); + } else if (Offset = V.getAsDebugInfoReference(); Offset) { + if (DWARFUnit *SpecUnit = U->getUnitVector().getUnitForOffset(*Offset)) + Result = SpecUnit->getDIEForOffset(*Offset); } return Result; } diff --git a/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp index b9cf7d22c80d4b..bc4badc7713802 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp @@ -665,16 +665,7 @@ DWARFFormValue::getAsSectionedAddress() const { return getAsSectionedAddress(Value, Form, U); } -std::optional DWARFFormValue::getAsReference() const { - if (auto R = getAsRelativeReference()) - return R->Unit ? R->Unit->getOffset() + R->Offset : R->Offset; - return std::nullopt; -} - -std::optional -DWARFFormValue::getAsRelativeReference() const { - if (!isFormClass(FC_Reference)) - return std::nullopt; +std::optional DWARFFormValue::getAsRelativeReference() const { switch (Form) { case DW_FORM_ref1: case DW_FORM_ref2: @@ -683,11 +674,30 @@ DWARFFormValue::getAsRelativeReference() const { case DW_FORM_ref_udata: if (!U) return std::nullopt; - return UnitOffset{const_cast(U), Value.uval}; - case DW_FORM_ref_addr: - case DW_FORM_ref_sig8: + return Value.uval; + default: + return std::nullopt; + } +} + +std::optional DWARFFormValue::getAsDebugInfoReference() const { + if (Form == DW_FORM_ref_addr) + return Value.uval; + return std::nullopt; +} + +std::optional DWARFFormValue::getAsSignatureReference() const { + if (Form == DW_FORM_ref_sig8) + return Value.uval; + return std::nullopt; +} + +std::optional DWARFFormValue::getAsSupplementaryReference() const { + switch (Form) { case DW_FORM_GNU_ref_alt: - return UnitOffset{nullptr, Value.uval}; + case DW_FORM_ref_sup4: + case DW_FORM_ref_sup8: + return Value.uval; default: return std::nullopt; } diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp index 4ef6c80ed0289d..a804deb446186d 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -836,7 +836,7 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, case DW_FORM_ref8: case DW_FORM_ref_udata: { // Verify all CU relative references are valid CU offsets. - std::optional RefVal = AttrValue.Value.getAsReference(); + std::optional RefVal = AttrValue.Value.getAsRelativeReference(); assert(RefVal); if (RefVal) { auto CUSize = DieCU->getNextUnitOffset() - DieCU->getOffset(); @@ -854,7 +854,8 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, } else { // Valid reference, but we will verify it points to an actual // DIE later. - LocalReferences[*RefVal].insert(Die.getOffset()); + LocalReferences[AttrValue.Value.getUnit()->getOffset() + *RefVal] + .insert(Die.getOffset()); } } break; @@ -862,7 +863,7 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, case DW_FORM_ref_addr: { // Verify all absolute DIE references have valid offsets in the // .debug_info section. - std::optional RefVal = AttrValue.Value.getAsReference(); + std::optional RefVal = AttrValue.Value.getAsDebugInfoReference(); assert(RefVal); if (RefVal) { if (*RefVal >= DieCU->getInfoSection().Data.size()) { diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp index 6a97bed9e3a838..68a14b8b0ad33e 100644 --- a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp @@ -1082,10 +1082,17 @@ void LVDWARFReader::updateReference(dwarf::Attribute Attr, // FIXME: We are assuming that at most one Reference (DW_AT_specification, // DW_AT_abstract_origin, ...) and at most one Type (DW_AT_import, DW_AT_type) // appear in any single DIE, but this may not be true. - uint64_t Reference = *FormValue.getAsReference(); + uint64_t Offset; + if (std::optional Off = FormValue.getAsRelativeReference()) + Offset = FormValue.getUnit()->getOffset() + *Off; + else if (Off = FormValue.getAsDebugInfoReference(); Off) + Offset = *Off; + else + llvm_unreachable("Unsupported reference type"); + // Get target for the given reference, if already created. LVElement *Target = getElementForOffset( - Reference, CurrentElement, + Offset, CurrentElement, /*IsType=*/Attr == dwarf::DW_AT_import || Attr == dwarf::DW_AT_type); // Check if we are dealing with cross CU references. if (FormValue.getForm() == dwarf::DW_FORM_ref_addr) { @@ -1093,10 +1100,10 @@ void LVDWARFReader::updateReference(dwarf::Attribute Attr, // The global reference is ready. Mark it as global. Target->setIsGlobalReference(); // Remove global reference from the unseen list. - removeGlobalOffset(Reference); + removeGlobalOffset(Offset); } else // Record the unseen cross CU reference. - addGlobalOffset(Reference); + addGlobalOffset(Offset); } // At this point, 'Target' can be null, in the case of the target element diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt index 6b3224e22ffb6f..8bf199f0f44c91 100644 --- a/llvm/lib/IR/CMakeLists.txt +++ b/llvm/lib/IR/CMakeLists.txt @@ -73,6 +73,7 @@ add_llvm_component_library(LLVMCore VectorBuilder.cpp Verifier.cpp VFABIDemangler.cpp + RuntimeLibcalls.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/IR @@ -81,6 +82,7 @@ add_llvm_component_library(LLVMCore ${LLVM_PTHREAD_LIB} DEPENDS + vt_gen intrinsics_gen LINK_COMPONENTS diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 9ba78731060439..f7b0fc5dd6b6c2 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -1685,6 +1685,32 @@ static int map_from_llvmopcode(LLVMOpcode code) llvm_unreachable("Unhandled Opcode."); } +/*-- GEP wrap flag conversions */ + +static GEPNoWrapFlags mapFromLLVMGEPNoWrapFlags(LLVMGEPNoWrapFlags GEPFlags) { + GEPNoWrapFlags NewGEPFlags; + if ((GEPFlags & LLVMGEPFlagInBounds) != 0) + NewGEPFlags |= GEPNoWrapFlags::inBounds(); + if ((GEPFlags & LLVMGEPFlagNUSW) != 0) + NewGEPFlags |= GEPNoWrapFlags::noUnsignedSignedWrap(); + if ((GEPFlags & LLVMGEPFlagNUW) != 0) + NewGEPFlags |= GEPNoWrapFlags::noUnsignedWrap(); + + return NewGEPFlags; +} + +static LLVMGEPNoWrapFlags mapToLLVMGEPNoWrapFlags(GEPNoWrapFlags GEPFlags) { + LLVMGEPNoWrapFlags NewGEPFlags = 0; + if (GEPFlags.isInBounds()) + NewGEPFlags |= LLVMGEPFlagInBounds; + if (GEPFlags.hasNoUnsignedSignedWrap()) + NewGEPFlags |= LLVMGEPFlagNUSW; + if (GEPFlags.hasNoUnsignedWrap()) + NewGEPFlags |= LLVMGEPFlagNUW; + + return NewGEPFlags; +} + /*--.. Constant expressions ................................................--*/ LLVMOpcode LLVMGetConstOpcode(LLVMValueRef ConstantVal) { @@ -1789,6 +1815,18 @@ LLVMValueRef LLVMConstInBoundsGEP2(LLVMTypeRef Ty, LLVMValueRef ConstantVal, return wrap(ConstantExpr::getInBoundsGetElementPtr(unwrap(Ty), Val, IdxList)); } +LLVMValueRef LLVMConstGEPWithNoWrapFlags(LLVMTypeRef Ty, + LLVMValueRef ConstantVal, + LLVMValueRef *ConstantIndices, + unsigned NumIndices, + LLVMGEPNoWrapFlags NoWrapFlags) { + ArrayRef IdxList(unwrap(ConstantIndices, NumIndices), + NumIndices); + Constant *Val = unwrap(ConstantVal); + return wrap(ConstantExpr::getGetElementPtr( + unwrap(Ty), Val, IdxList, mapFromLLVMGEPNoWrapFlags(NoWrapFlags))); +} + LLVMValueRef LLVMConstTrunc(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getTrunc(unwrap(ConstantVal), unwrap(ToType))); @@ -3102,6 +3140,16 @@ LLVMTypeRef LLVMGetGEPSourceElementType(LLVMValueRef GEP) { return wrap(unwrap(GEP)->getSourceElementType()); } +LLVMGEPNoWrapFlags LLVMGEPGetNoWrapFlags(LLVMValueRef GEP) { + GEPOperator *GEPOp = unwrap(GEP); + return mapToLLVMGEPNoWrapFlags(GEPOp->getNoWrapFlags()); +} + +void LLVMGEPSetNoWrapFlags(LLVMValueRef GEP, LLVMGEPNoWrapFlags NoWrapFlags) { + GetElementPtrInst *GEPInst = unwrap(GEP); + GEPInst->setNoWrapFlags(mapFromLLVMGEPNoWrapFlags(NoWrapFlags)); +} + /*--.. Operations on phi nodes .............................................--*/ void LLVMAddIncoming(LLVMValueRef PhiNode, LLVMValueRef *IncomingValues, @@ -3902,6 +3950,16 @@ LLVMValueRef LLVMBuildInBoundsGEP2(LLVMBuilderRef B, LLVMTypeRef Ty, unwrap(B)->CreateInBoundsGEP(unwrap(Ty), unwrap(Pointer), IdxList, Name)); } +LLVMValueRef LLVMBuildGEPWithNoWrapFlags(LLVMBuilderRef B, LLVMTypeRef Ty, + LLVMValueRef Pointer, + LLVMValueRef *Indices, + unsigned NumIndices, const char *Name, + LLVMGEPNoWrapFlags NoWrapFlags) { + ArrayRef IdxList(unwrap(Indices), NumIndices); + return wrap(unwrap(B)->CreateGEP(unwrap(Ty), unwrap(Pointer), IdxList, Name, + mapFromLLVMGEPNoWrapFlags(NoWrapFlags))); +} + LLVMValueRef LLVMBuildStructGEP2(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef Pointer, unsigned Idx, const char *Name) { diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp index 5f42ce22f72fec..3aec7140510a6d 100644 --- a/llvm/lib/IR/Metadata.cpp +++ b/llvm/lib/IR/Metadata.cpp @@ -1590,7 +1590,7 @@ void Instruction::dropUnknownNonDebugMetadata(ArrayRef KnownIDs) { if (!Value::hasMetadata()) return; // Nothing to remove! - SmallSet KnownSet; + SmallSet KnownSet; KnownSet.insert(KnownIDs.begin(), KnownIDs.end()); // A DIAssignID attachment is debug metadata, don't drop it. diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp new file mode 100644 index 00000000000000..de3db557d8b50b --- /dev/null +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -0,0 +1,379 @@ +//===- RuntimeLibcalls.cpp - Interface for runtime libcalls -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/RuntimeLibcalls.h" + +using namespace llvm; +using namespace RTLIB; + +/// Set default libcall names. If a target wants to opt-out of a libcall it +/// should be placed here. +void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) { + std::fill(std::begin(LibcallRoutineNames), std::end(LibcallRoutineNames), + nullptr); + +#define HANDLE_LIBCALL(code, name) setLibcallName(RTLIB::code, name); +#include "llvm/IR/RuntimeLibcalls.def" +#undef HANDLE_LIBCALL + + // Initialize calling conventions to their default. + for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC) + setLibcallCallingConv((RTLIB::Libcall)LC, CallingConv::C); + + // Use the f128 variants of math functions on x86_64 + if (TT.getArch() == Triple::ArchType::x86_64 && TT.isGNUEnvironment()) { + setLibcallName(RTLIB::REM_F128, "fmodf128"); + setLibcallName(RTLIB::FMA_F128, "fmaf128"); + setLibcallName(RTLIB::SQRT_F128, "sqrtf128"); + setLibcallName(RTLIB::CBRT_F128, "cbrtf128"); + setLibcallName(RTLIB::LOG_F128, "logf128"); + setLibcallName(RTLIB::LOG_FINITE_F128, "__logf128_finite"); + setLibcallName(RTLIB::LOG2_F128, "log2f128"); + setLibcallName(RTLIB::LOG2_FINITE_F128, "__log2f128_finite"); + setLibcallName(RTLIB::LOG10_F128, "log10f128"); + setLibcallName(RTLIB::LOG10_FINITE_F128, "__log10f128_finite"); + setLibcallName(RTLIB::EXP_F128, "expf128"); + setLibcallName(RTLIB::EXP_FINITE_F128, "__expf128_finite"); + setLibcallName(RTLIB::EXP2_F128, "exp2f128"); + setLibcallName(RTLIB::EXP2_FINITE_F128, "__exp2f128_finite"); + setLibcallName(RTLIB::EXP10_F128, "exp10f128"); + setLibcallName(RTLIB::SIN_F128, "sinf128"); + setLibcallName(RTLIB::COS_F128, "cosf128"); + setLibcallName(RTLIB::TAN_F128, "tanf128"); + setLibcallName(RTLIB::SINCOS_F128, "sincosf128"); + setLibcallName(RTLIB::ASIN_F128, "asinf128"); + setLibcallName(RTLIB::ACOS_F128, "acosf128"); + setLibcallName(RTLIB::ATAN_F128, "atanf128"); + setLibcallName(RTLIB::SINH_F128, "sinhf128"); + setLibcallName(RTLIB::COSH_F128, "coshf128"); + setLibcallName(RTLIB::TANH_F128, "tanhf128"); + setLibcallName(RTLIB::POW_F128, "powf128"); + setLibcallName(RTLIB::POW_FINITE_F128, "__powf128_finite"); + setLibcallName(RTLIB::CEIL_F128, "ceilf128"); + setLibcallName(RTLIB::TRUNC_F128, "truncf128"); + setLibcallName(RTLIB::RINT_F128, "rintf128"); + setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128"); + setLibcallName(RTLIB::ROUND_F128, "roundf128"); + setLibcallName(RTLIB::ROUNDEVEN_F128, "roundevenf128"); + setLibcallName(RTLIB::FLOOR_F128, "floorf128"); + setLibcallName(RTLIB::COPYSIGN_F128, "copysignf128"); + setLibcallName(RTLIB::FMIN_F128, "fminf128"); + setLibcallName(RTLIB::FMAX_F128, "fmaxf128"); + setLibcallName(RTLIB::LROUND_F128, "lroundf128"); + setLibcallName(RTLIB::LLROUND_F128, "llroundf128"); + setLibcallName(RTLIB::LRINT_F128, "lrintf128"); + setLibcallName(RTLIB::LLRINT_F128, "llrintf128"); + setLibcallName(RTLIB::LDEXP_F128, "ldexpf128"); + setLibcallName(RTLIB::FREXP_F128, "frexpf128"); + } + + // For IEEE quad-precision libcall names, PPC uses "kf" instead of "tf". + if (TT.isPPC()) { + setLibcallName(RTLIB::ADD_F128, "__addkf3"); + setLibcallName(RTLIB::SUB_F128, "__subkf3"); + setLibcallName(RTLIB::MUL_F128, "__mulkf3"); + setLibcallName(RTLIB::DIV_F128, "__divkf3"); + setLibcallName(RTLIB::POWI_F128, "__powikf2"); + setLibcallName(RTLIB::FPEXT_F32_F128, "__extendsfkf2"); + setLibcallName(RTLIB::FPEXT_F64_F128, "__extenddfkf2"); + setLibcallName(RTLIB::FPROUND_F128_F32, "__trunckfsf2"); + setLibcallName(RTLIB::FPROUND_F128_F64, "__trunckfdf2"); + setLibcallName(RTLIB::FPTOSINT_F128_I32, "__fixkfsi"); + setLibcallName(RTLIB::FPTOSINT_F128_I64, "__fixkfdi"); + setLibcallName(RTLIB::FPTOSINT_F128_I128, "__fixkfti"); + setLibcallName(RTLIB::FPTOUINT_F128_I32, "__fixunskfsi"); + setLibcallName(RTLIB::FPTOUINT_F128_I64, "__fixunskfdi"); + setLibcallName(RTLIB::FPTOUINT_F128_I128, "__fixunskfti"); + setLibcallName(RTLIB::SINTTOFP_I32_F128, "__floatsikf"); + setLibcallName(RTLIB::SINTTOFP_I64_F128, "__floatdikf"); + setLibcallName(RTLIB::SINTTOFP_I128_F128, "__floattikf"); + setLibcallName(RTLIB::UINTTOFP_I32_F128, "__floatunsikf"); + setLibcallName(RTLIB::UINTTOFP_I64_F128, "__floatundikf"); + setLibcallName(RTLIB::UINTTOFP_I128_F128, "__floatuntikf"); + setLibcallName(RTLIB::OEQ_F128, "__eqkf2"); + setLibcallName(RTLIB::UNE_F128, "__nekf2"); + setLibcallName(RTLIB::OGE_F128, "__gekf2"); + setLibcallName(RTLIB::OLT_F128, "__ltkf2"); + setLibcallName(RTLIB::OLE_F128, "__lekf2"); + setLibcallName(RTLIB::OGT_F128, "__gtkf2"); + setLibcallName(RTLIB::UO_F128, "__unordkf2"); + } + + // A few names are different on particular architectures or environments. + if (TT.isOSDarwin()) { + // For f16/f32 conversions, Darwin uses the standard naming scheme, + // instead of the gnueabi-style __gnu_*_ieee. + // FIXME: What about other targets? + setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); + setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); + + // Some darwins have an optimized __bzero/bzero function. + switch (TT.getArch()) { + case Triple::x86: + case Triple::x86_64: + if (TT.isMacOSX() && !TT.isMacOSXVersionLT(10, 6)) + setLibcallName(RTLIB::BZERO, "__bzero"); + break; + case Triple::aarch64: + case Triple::aarch64_32: + setLibcallName(RTLIB::BZERO, "bzero"); + break; + default: + break; + } + + if (darwinHasSinCos(TT)) { + setLibcallName(RTLIB::SINCOS_STRET_F32, "__sincosf_stret"); + setLibcallName(RTLIB::SINCOS_STRET_F64, "__sincos_stret"); + if (TT.isWatchABI()) { + setLibcallCallingConv(RTLIB::SINCOS_STRET_F32, + CallingConv::ARM_AAPCS_VFP); + setLibcallCallingConv(RTLIB::SINCOS_STRET_F64, + CallingConv::ARM_AAPCS_VFP); + } + } + + switch (TT.getOS()) { + case Triple::MacOSX: + if (TT.isMacOSXVersionLT(10, 9)) { + setLibcallName(RTLIB::EXP10_F32, nullptr); + setLibcallName(RTLIB::EXP10_F64, nullptr); + } else { + setLibcallName(RTLIB::EXP10_F32, "__exp10f"); + setLibcallName(RTLIB::EXP10_F64, "__exp10"); + } + break; + case Triple::IOS: + if (TT.isOSVersionLT(7, 0)) { + setLibcallName(RTLIB::EXP10_F32, nullptr); + setLibcallName(RTLIB::EXP10_F64, nullptr); + break; + } + [[fallthrough]]; + case Triple::TvOS: + case Triple::WatchOS: + case Triple::XROS: + setLibcallName(RTLIB::EXP10_F32, "__exp10f"); + setLibcallName(RTLIB::EXP10_F64, "__exp10"); + break; + default: + break; + } + } else { + setLibcallName(RTLIB::FPEXT_F16_F32, "__gnu_h2f_ieee"); + setLibcallName(RTLIB::FPROUND_F32_F16, "__gnu_f2h_ieee"); + } + + if (TT.isGNUEnvironment() || TT.isOSFuchsia() || + (TT.isAndroid() && !TT.isAndroidVersionLT(9))) { + setLibcallName(RTLIB::SINCOS_F32, "sincosf"); + setLibcallName(RTLIB::SINCOS_F64, "sincos"); + setLibcallName(RTLIB::SINCOS_F80, "sincosl"); + setLibcallName(RTLIB::SINCOS_F128, "sincosl"); + setLibcallName(RTLIB::SINCOS_PPCF128, "sincosl"); + } + + if (TT.isPS()) { + setLibcallName(RTLIB::SINCOS_F32, "sincosf"); + setLibcallName(RTLIB::SINCOS_F64, "sincos"); + } + + if (TT.isOSOpenBSD()) { + setLibcallName(RTLIB::STACKPROTECTOR_CHECK_FAIL, nullptr); + } + + if (TT.isOSWindows() && !TT.isOSCygMing()) { + setLibcallName(RTLIB::LDEXP_F32, nullptr); + setLibcallName(RTLIB::LDEXP_F80, nullptr); + setLibcallName(RTLIB::LDEXP_F128, nullptr); + setLibcallName(RTLIB::LDEXP_PPCF128, nullptr); + + setLibcallName(RTLIB::FREXP_F32, nullptr); + setLibcallName(RTLIB::FREXP_F80, nullptr); + setLibcallName(RTLIB::FREXP_F128, nullptr); + setLibcallName(RTLIB::FREXP_PPCF128, nullptr); + } + + if (TT.isAArch64()) { + if (TT.isOSMSVCRT()) { + // MSVCRT doesn't have powi; fall back to pow + setLibcallName(RTLIB::POWI_F32, nullptr); + setLibcallName(RTLIB::POWI_F64, nullptr); + } + } + + // Disable most libcalls on AMDGPU. + if (TT.isAMDGPU()) { + for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) { + if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16) + setLibcallName(static_cast(I), nullptr); + } + } + + // Disable most libcalls on NVPTX. + if (TT.isNVPTX()) { + for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) + if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16) + setLibcallName(static_cast(I), nullptr); + } + + if (TT.isARM() || TT.isThumb()) { + // These libcalls are not available in 32-bit. + setLibcallName(RTLIB::SHL_I128, nullptr); + setLibcallName(RTLIB::SRL_I128, nullptr); + setLibcallName(RTLIB::SRA_I128, nullptr); + setLibcallName(RTLIB::MUL_I128, nullptr); + setLibcallName(RTLIB::MULO_I64, nullptr); + setLibcallName(RTLIB::MULO_I128, nullptr); + + if (TT.isOSMSVCRT()) { + // MSVCRT doesn't have powi; fall back to pow + setLibcallName(RTLIB::POWI_F32, nullptr); + setLibcallName(RTLIB::POWI_F64, nullptr); + } + } + + if (TT.getArch() == Triple::ArchType::avr) { + // Division rtlib functions (not supported), use divmod functions instead + setLibcallName(RTLIB::SDIV_I8, nullptr); + setLibcallName(RTLIB::SDIV_I16, nullptr); + setLibcallName(RTLIB::SDIV_I32, nullptr); + setLibcallName(RTLIB::UDIV_I8, nullptr); + setLibcallName(RTLIB::UDIV_I16, nullptr); + setLibcallName(RTLIB::UDIV_I32, nullptr); + + // Modulus rtlib functions (not supported), use divmod functions instead + setLibcallName(RTLIB::SREM_I8, nullptr); + setLibcallName(RTLIB::SREM_I16, nullptr); + setLibcallName(RTLIB::SREM_I32, nullptr); + setLibcallName(RTLIB::UREM_I8, nullptr); + setLibcallName(RTLIB::UREM_I16, nullptr); + setLibcallName(RTLIB::UREM_I32, nullptr); + } + + if (TT.getArch() == Triple::ArchType::hexagon) { + // These cause problems when the shift amount is non-constant. + setLibcallName(RTLIB::SHL_I128, nullptr); + setLibcallName(RTLIB::SRL_I128, nullptr); + setLibcallName(RTLIB::SRA_I128, nullptr); + } + + if (TT.isLoongArch()) { + if (!TT.isLoongArch64()) { + // Set libcalls. + setLibcallName(RTLIB::MUL_I128, nullptr); + // The MULO libcall is not part of libgcc, only compiler-rt. + setLibcallName(RTLIB::MULO_I64, nullptr); + } + // The MULO libcall is not part of libgcc, only compiler-rt. + setLibcallName(RTLIB::MULO_I128, nullptr); + } + + if (TT.isMIPS32()) { + // These libcalls are not available in 32-bit. + setLibcallName(RTLIB::SHL_I128, nullptr); + setLibcallName(RTLIB::SRL_I128, nullptr); + setLibcallName(RTLIB::SRA_I128, nullptr); + setLibcallName(RTLIB::MUL_I128, nullptr); + setLibcallName(RTLIB::MULO_I64, nullptr); + setLibcallName(RTLIB::MULO_I128, nullptr); + } + + if (TT.isPPC()) { + if (!TT.isPPC64()) { + // These libcalls are not available in 32-bit. + setLibcallName(RTLIB::SHL_I128, nullptr); + setLibcallName(RTLIB::SRL_I128, nullptr); + setLibcallName(RTLIB::SRA_I128, nullptr); + setLibcallName(RTLIB::MUL_I128, nullptr); + setLibcallName(RTLIB::MULO_I64, nullptr); + } + setLibcallName(RTLIB::MULO_I128, nullptr); + } + + if (TT.isRISCV32()) { + // These libcalls are not available in 32-bit. + setLibcallName(RTLIB::SHL_I128, nullptr); + setLibcallName(RTLIB::SRL_I128, nullptr); + setLibcallName(RTLIB::SRA_I128, nullptr); + setLibcallName(RTLIB::MUL_I128, nullptr); + setLibcallName(RTLIB::MULO_I64, nullptr); + } + + if (TT.isSPARC()) { + if (!TT.isSPARC64()) { + // These libcalls are not available in 32-bit. + setLibcallName(RTLIB::MULO_I64, nullptr); + setLibcallName(RTLIB::MUL_I128, nullptr); + setLibcallName(RTLIB::SHL_I128, nullptr); + setLibcallName(RTLIB::SRL_I128, nullptr); + setLibcallName(RTLIB::SRA_I128, nullptr); + } + setLibcallName(RTLIB::MULO_I128, nullptr); + } + + if (TT.isSystemZ()) { + setLibcallName(RTLIB::SRL_I128, nullptr); + setLibcallName(RTLIB::SHL_I128, nullptr); + setLibcallName(RTLIB::SRA_I128, nullptr); + } + + if (TT.isX86()) { + if (TT.getArch() == Triple::ArchType::x86) { + // These libcalls are not available in 32-bit. + setLibcallName(RTLIB::SHL_I128, nullptr); + setLibcallName(RTLIB::SRL_I128, nullptr); + setLibcallName(RTLIB::SRA_I128, nullptr); + setLibcallName(RTLIB::MUL_I128, nullptr); + // The MULO libcall is not part of libgcc, only compiler-rt. + setLibcallName(RTLIB::MULO_I64, nullptr); + } + + // The MULO libcall is not part of libgcc, only compiler-rt. + setLibcallName(RTLIB::MULO_I128, nullptr); + + if (TT.isOSMSVCRT()) { + // MSVCRT doesn't have powi; fall back to pow + setLibcallName(RTLIB::POWI_F32, nullptr); + setLibcallName(RTLIB::POWI_F64, nullptr); + } + } +} + +void RuntimeLibcallsInfo::initCmpLibcallCCs() { + std::fill(CmpLibcallCCs, CmpLibcallCCs + RTLIB::UNKNOWN_LIBCALL, + ISD::SETCC_INVALID); + CmpLibcallCCs[RTLIB::OEQ_F32] = ISD::SETEQ; + CmpLibcallCCs[RTLIB::OEQ_F64] = ISD::SETEQ; + CmpLibcallCCs[RTLIB::OEQ_F128] = ISD::SETEQ; + CmpLibcallCCs[RTLIB::OEQ_PPCF128] = ISD::SETEQ; + CmpLibcallCCs[RTLIB::UNE_F32] = ISD::SETNE; + CmpLibcallCCs[RTLIB::UNE_F64] = ISD::SETNE; + CmpLibcallCCs[RTLIB::UNE_F128] = ISD::SETNE; + CmpLibcallCCs[RTLIB::UNE_PPCF128] = ISD::SETNE; + CmpLibcallCCs[RTLIB::OGE_F32] = ISD::SETGE; + CmpLibcallCCs[RTLIB::OGE_F64] = ISD::SETGE; + CmpLibcallCCs[RTLIB::OGE_F128] = ISD::SETGE; + CmpLibcallCCs[RTLIB::OGE_PPCF128] = ISD::SETGE; + CmpLibcallCCs[RTLIB::OLT_F32] = ISD::SETLT; + CmpLibcallCCs[RTLIB::OLT_F64] = ISD::SETLT; + CmpLibcallCCs[RTLIB::OLT_F128] = ISD::SETLT; + CmpLibcallCCs[RTLIB::OLT_PPCF128] = ISD::SETLT; + CmpLibcallCCs[RTLIB::OLE_F32] = ISD::SETLE; + CmpLibcallCCs[RTLIB::OLE_F64] = ISD::SETLE; + CmpLibcallCCs[RTLIB::OLE_F128] = ISD::SETLE; + CmpLibcallCCs[RTLIB::OLE_PPCF128] = ISD::SETLE; + CmpLibcallCCs[RTLIB::OGT_F32] = ISD::SETGT; + CmpLibcallCCs[RTLIB::OGT_F64] = ISD::SETGT; + CmpLibcallCCs[RTLIB::OGT_F128] = ISD::SETGT; + CmpLibcallCCs[RTLIB::OGT_PPCF128] = ISD::SETGT; + CmpLibcallCCs[RTLIB::UO_F32] = ISD::SETNE; + CmpLibcallCCs[RTLIB::UO_F64] = ISD::SETNE; + CmpLibcallCCs[RTLIB::UO_F128] = ISD::SETNE; + CmpLibcallCCs[RTLIB::UO_PPCF128] = ISD::SETNE; +} diff --git a/llvm/lib/IR/VectorBuilder.cpp b/llvm/lib/IR/VectorBuilder.cpp index c07bc0561fba93..5ff30828798950 100644 --- a/llvm/lib/IR/VectorBuilder.cpp +++ b/llvm/lib/IR/VectorBuilder.cpp @@ -57,7 +57,70 @@ Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy, auto VPID = VPIntrinsic::getForOpcode(Opcode); if (VPID == Intrinsic::not_intrinsic) return returnWithError("No VPIntrinsic for this opcode"); + return createVectorInstructionImpl(VPID, ReturnTy, InstOpArray, Name); +} + +Value *VectorBuilder::createSimpleTargetReduction(RecurKind Kind, Type *ValTy, + ArrayRef InstOpArray, + const Twine &Name) { + Intrinsic::ID VPID; + switch (Kind) { + case RecurKind::Add: + VPID = Intrinsic::vp_reduce_add; + break; + case RecurKind::Mul: + VPID = Intrinsic::vp_reduce_mul; + break; + case RecurKind::And: + VPID = Intrinsic::vp_reduce_and; + break; + case RecurKind::Or: + VPID = Intrinsic::vp_reduce_or; + break; + case RecurKind::Xor: + VPID = Intrinsic::vp_reduce_xor; + break; + case RecurKind::FMulAdd: + case RecurKind::FAdd: + VPID = Intrinsic::vp_reduce_fadd; + break; + case RecurKind::FMul: + VPID = Intrinsic::vp_reduce_fmul; + break; + case RecurKind::SMax: + VPID = Intrinsic::vp_reduce_smax; + break; + case RecurKind::SMin: + VPID = Intrinsic::vp_reduce_smin; + break; + case RecurKind::UMax: + VPID = Intrinsic::vp_reduce_umax; + break; + case RecurKind::UMin: + VPID = Intrinsic::vp_reduce_umin; + break; + case RecurKind::FMax: + VPID = Intrinsic::vp_reduce_fmax; + break; + case RecurKind::FMin: + VPID = Intrinsic::vp_reduce_fmin; + break; + case RecurKind::FMaximum: + VPID = Intrinsic::vp_reduce_fmaximum; + break; + case RecurKind::FMinimum: + VPID = Intrinsic::vp_reduce_fminimum; + break; + default: + llvm_unreachable("No VPIntrinsic for this reduction"); + } + return createVectorInstructionImpl(VPID, ValTy, InstOpArray, Name); +} +Value *VectorBuilder::createVectorInstructionImpl(Intrinsic::ID VPID, + Type *ReturnTy, + ArrayRef InstOpArray, + const Twine &Name) { auto MaskPosOpt = VPIntrinsic::getMaskParamPos(VPID); auto VLenPosOpt = VPIntrinsic::getVectorLengthParamPos(VPID); size_t NumInstParams = InstOpArray.size(); diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 4ffacbb9f0ffd7..d303f228aa72c3 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -30,6 +30,7 @@ #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Mangler.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/RuntimeLibcalls.h" #include "llvm/LTO/LTOBackend.h" #include "llvm/LTO/SummaryBasedOptimizations.h" #include "llvm/Linker/IRMover.h" @@ -1357,14 +1358,13 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); } -static const char *libcallRoutineNames[] = { -#define HANDLE_LIBCALL(code, name) name, -#include "llvm/IR/RuntimeLibcalls.def" -#undef HANDLE_LIBCALL -}; +SmallVector LTO::getRuntimeLibcallSymbols(const Triple &TT) { + RTLIB::RuntimeLibcallsInfo Libcalls(TT); -ArrayRef LTO::getRuntimeLibcallSymbols() { - return ArrayRef(libcallRoutineNames); + SmallVector LibcallSymbols; + copy_if(Libcalls.getLibcallNames(), std::back_inserter(LibcallSymbols), + [](const char *Name) { return Name; }); + return LibcallSymbols; } /// This class defines the interface to the ThinLTO backend. diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index ba8f371127764b..beab194a52b817 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -1192,8 +1192,8 @@ void IRLinker::prepareCompileUnitsForImport() { // When importing for ThinLTO, prevent importing of types listed on // the DICompileUnit that we don't need a copy of in the importing // module. They will be emitted by the originating module. - for (unsigned I = 0, E = SrcCompileUnits->getNumOperands(); I != E; ++I) { - auto *CU = cast(SrcCompileUnits->getOperand(I)); + for (MDNode *N : SrcCompileUnits->operands()) { + auto *CU = cast(N); assert(CU && "Expected valid compile unit"); // Enums, macros, and retained types don't need to be listed on the // imported DICompileUnit. This means they will only be imported diff --git a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp index 42d9b7056e9eb7..813b1194b47cbf 100644 --- a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp +++ b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp @@ -49,6 +49,7 @@ MCOPT(bool, NoTypeCheck) MCOPT(bool, SaveTempLabels) MCOPT(bool, Crel) MCOPT(bool, X86RelaxRelocations) +MCOPT(bool, X86Sse2Avx) MCOPT(std::string, ABIName) MCOPT(std::string, AsSecureLogFile) @@ -140,6 +141,11 @@ llvm::mc::RegisterMCTargetOptionsFlags::RegisterMCTargetOptionsFlags() { cl::init(true)); MCBINDOPT(X86RelaxRelocations); + static cl::opt X86Sse2Avx( + "x86-sse2avx", cl::desc("Specify that the assembler should encode SSE " + "instructions with VEX prefix")); + MCBINDOPT(X86Sse2Avx); + static cl::opt ABIName( "target-abi", cl::Hidden, cl::desc("The name of the ABI to be targeted from the backend."), @@ -169,6 +175,7 @@ MCTargetOptions llvm::mc::InitMCTargetOptionsFromFlags() { Options.MCSaveTempLabels = getSaveTempLabels(); Options.Crel = getCrel(); Options.X86RelaxRelocations = getX86RelaxRelocations(); + Options.X86Sse2Avx = getX86Sse2Avx(); Options.EmitDwarfUnwind = getEmitDwarfUnwind(); Options.EmitCompactUnwindNonCanonical = getEmitCompactUnwindNonCanonical(); Options.AsSecureLogFile = getAsSecureLogFile(); diff --git a/llvm/lib/Object/COFFImportFile.cpp b/llvm/lib/Object/COFFImportFile.cpp index a9f150a965c357..2458a53cb6d548 100644 --- a/llvm/lib/Object/COFFImportFile.cpp +++ b/llvm/lib/Object/COFFImportFile.cpp @@ -12,6 +12,8 @@ #include "llvm/Object/COFFImportFile.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" #include "llvm/ADT/Twine.h" #include "llvm/Object/Archive.h" #include "llvm/Object/ArchiveWriter.h" @@ -52,18 +54,12 @@ StringRef COFFImportFile::getFileFormatName() const { } } -StringRef COFFImportFile::getExportName() const { - const coff_import_header *hdr = getCOFFImportHeader(); - StringRef name = Data.getBuffer().substr(sizeof(*hdr)).split('\0').first; - +static StringRef applyNameType(ImportNameType Type, StringRef name) { auto ltrim1 = [](StringRef s, StringRef chars) { return !s.empty() && chars.contains(s[0]) ? s.substr(1) : s; }; - switch (hdr->getNameType()) { - case IMPORT_ORDINAL: - name = ""; - break; + switch (Type) { case IMPORT_NAME_NOPREFIX: name = ltrim1(name, "?@_"); break; @@ -71,6 +67,24 @@ StringRef COFFImportFile::getExportName() const { name = ltrim1(name, "?@_"); name = name.substr(0, name.find('@')); break; + default: + break; + } + return name; +} + +StringRef COFFImportFile::getExportName() const { + const coff_import_header *hdr = getCOFFImportHeader(); + StringRef name = Data.getBuffer().substr(sizeof(*hdr)).split('\0').first; + + switch (hdr->getNameType()) { + case IMPORT_ORDINAL: + name = ""; + break; + case IMPORT_NAME_NOPREFIX: + case IMPORT_NAME_UNDECORATE: + name = applyNameType(static_cast(hdr->getNameType()), name); + break; case IMPORT_NAME_EXPORTAS: { // Skip DLL name name = Data.getBuffer().substr(sizeof(*hdr) + name.size() + 1); @@ -667,6 +681,13 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path, auto addExports = [&](ArrayRef Exp, MachineTypes M) -> Error { + StringMap RegularImports; + struct Deferred { + std::string Name; + ImportType ImpType; + const COFFShortExport *Export; + }; + SmallVector Renames; for (const COFFShortExport &E : Exp) { if (E.Private) continue; @@ -690,14 +711,6 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path, Name.swap(*ReplacedName); } - if (!E.ImportName.empty() && Name != E.ImportName) { - if (ImportType == IMPORT_CODE) - Members.push_back( - OF.createWeakExternal(E.ImportName, Name, false, M)); - Members.push_back(OF.createWeakExternal(E.ImportName, Name, true, M)); - continue; - } - ImportNameType NameType; std::string ExportName; if (E.Noname) { @@ -705,6 +718,27 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path, } else if (!E.ExportAs.empty()) { NameType = IMPORT_NAME_EXPORTAS; ExportName = E.ExportAs; + } else if (!E.ImportName.empty()) { + // If we need to import from a specific ImportName, we may need to use + // a weak alias (which needs another import to point at). But if we can + // express ImportName based on the symbol name and a specific NameType, + // prefer that over an alias. + if (Machine == IMAGE_FILE_MACHINE_I386 && + applyNameType(IMPORT_NAME_UNDECORATE, Name) == E.ImportName) + NameType = IMPORT_NAME_UNDECORATE; + else if (Machine == IMAGE_FILE_MACHINE_I386 && + applyNameType(IMPORT_NAME_NOPREFIX, Name) == E.ImportName) + NameType = IMPORT_NAME_NOPREFIX; + else if (Name == E.ImportName) + NameType = IMPORT_NAME; + else { + Deferred D; + D.Name = Name; + D.ImpType = ImportType; + D.Export = &E; + Renames.push_back(D); + continue; + } } else { NameType = getNameType(SymbolName, E.Name, M, MinGW); } @@ -724,9 +758,25 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path, } } + RegularImports[applyNameType(NameType, Name)] = Name; Members.push_back(OF.createShortImport(Name, E.Ordinal, ImportType, NameType, ExportName, M)); } + for (const auto &D : Renames) { + auto It = RegularImports.find(D.Export->ImportName); + if (It != RegularImports.end()) { + // We have a regular import entry for a symbol with the name we + // want to reference; produce an alias pointing at that. + StringRef Symbol = It->second; + if (D.ImpType == IMPORT_CODE) + Members.push_back(OF.createWeakExternal(Symbol, D.Name, false, M)); + Members.push_back(OF.createWeakExternal(Symbol, D.Name, true, M)); + } else { + Members.push_back(OF.createShortImport(D.Name, D.Export->Ordinal, + D.ImpType, IMPORT_NAME_EXPORTAS, + D.Export->ImportName, M)); + } + } return Error::success(); }; diff --git a/llvm/lib/Object/COFFModuleDefinition.cpp b/llvm/lib/Object/COFFModuleDefinition.cpp index 0c0bef1319e443..82c18539658e84 100644 --- a/llvm/lib/Object/COFFModuleDefinition.cpp +++ b/llvm/lib/Object/COFFModuleDefinition.cpp @@ -282,8 +282,6 @@ class Parser { if (Tok.K == EqualEqual) { read(); E.ImportName = std::string(Tok.Value); - if (AddUnderscores && !isDecorated(E.ImportName, MingwDef)) - E.ImportName = std::string("_").append(E.ImportName); continue; } // EXPORTAS must be at the end of export definition diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp index 7e29da4d5e948f..2a2b235461a550 100644 --- a/llvm/lib/Object/IRSymtab.cpp +++ b/llvm/lib/Object/IRSymtab.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/Mangler.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/RuntimeLibcalls.h" #include "llvm/MC/StringTableBuilder.h" #include "llvm/Object/ModuleSymbolTable.h" #include "llvm/Object/SymbolicFile.h" @@ -46,9 +47,6 @@ static cl::opt DisableBitcodeVersionUpgrade( cl::desc("Disable automatic bitcode upgrade for version mismatch")); static const char *PreservedSymbols[] = { -#define HANDLE_LIBCALL(code, name) name, -#include "llvm/IR/RuntimeLibcalls.def" -#undef HANDLE_LIBCALL // There are global variables, so put it here instead of in // RuntimeLibcalls.def. // TODO: Are there similar such variables? @@ -215,9 +213,16 @@ Expected Builder::getComdatIndex(const Comdat *C, const Module *M) { return P.first->second; } -static DenseSet buildPreservedSymbolsSet() { - return DenseSet(std::begin(PreservedSymbols), - std::end(PreservedSymbols)); +static DenseSet buildPreservedSymbolsSet(const Triple &TT) { + DenseSet PreservedSymbolSet(std::begin(PreservedSymbols), + std::end(PreservedSymbols)); + + RTLIB::RuntimeLibcallsInfo Libcalls(TT); + for (const char *Name : Libcalls.getLibcallNames()) { + if (Name) + PreservedSymbolSet.insert(Name); + } + return PreservedSymbolSet; } Error Builder::addSymbol(const ModuleSymbolTable &Msymtab, @@ -276,7 +281,8 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab, setStr(Sym.IRName, GV->getName()); static const DenseSet PreservedSymbolsSet = - buildPreservedSymbolsSet(); + buildPreservedSymbolsSet( + llvm::Triple(GV->getParent()->getTargetTriple())); bool IsPreservedSymbol = PreservedSymbolsSet.contains(GV->getName()); if (Used.count(GV) || IsPreservedSymbol) diff --git a/llvm/lib/Object/XCOFFObjectFile.cpp b/llvm/lib/Object/XCOFFObjectFile.cpp index 25a60f379fc222..6efb8759d13fbd 100644 --- a/llvm/lib/Object/XCOFFObjectFile.cpp +++ b/llvm/lib/Object/XCOFFObjectFile.cpp @@ -1369,7 +1369,7 @@ Expected XCOFFSymbolRef::getName() const { return getObject()->getStringTableEntry(getSymbol64()->Offset); } -// Explictly instantiate template classes. +// Explicitly instantiate template classes. template struct XCOFFSectionHeader; template struct XCOFFSectionHeader; diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index d4eca4a48e55d1..929690c2c74d6c 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -100,9 +100,11 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineVerifier.h" #include "llvm/CodeGen/PreISelIntrinsicLowering.h" #include "llvm/CodeGen/RegAllocFast.h" #include "llvm/CodeGen/SafeStack.h" @@ -112,6 +114,7 @@ #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/StackProtector.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TwoAddressInstructionPass.h" #include "llvm/CodeGen/TypePromotion.h" #include "llvm/CodeGen/WasmEHPrepare.h" #include "llvm/CodeGen/WinEHPrepare.h" diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index 2ed0e237d8de7d..fc7b82d522bf0e 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineVerifier.h" #include "llvm/Demangle/Demangle.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -1451,10 +1452,10 @@ void PreservedCFGCheckerInstrumentation::registerCallbacks( }); } -void VerifyInstrumentation::registerCallbacks( - PassInstrumentationCallbacks &PIC) { +void VerifyInstrumentation::registerCallbacks(PassInstrumentationCallbacks &PIC, + ModuleAnalysisManager *MAM) { PIC.registerAfterPassCallback( - [this](StringRef P, Any IR, const PreservedAnalyses &PassPA) { + [this, MAM](StringRef P, Any IR, const PreservedAnalyses &PassPA) { if (isIgnored(P) || P == "VerifierPass") return; const auto *F = unwrapIR(IR); @@ -1488,15 +1489,23 @@ void VerifyInstrumentation::registerCallbacks( P)); } - // TODO: Use complete MachineVerifierPass. if (auto *MF = unwrapIR(IR)) { if (DebugLogging) dbgs() << "Verifying machine function " << MF->getName() << '\n'; - verifyMachineFunction( + std::string Banner = formatv("Broken machine function found after pass " "\"{0}\", compilation aborted!", - P), - *MF); + P); + if (MAM) { + Module &M = const_cast(*MF->getFunction().getParent()); + auto &MFAM = + MAM->getResult(M) + .getManager(); + MachineVerifierPass Verifier(Banner); + Verifier.run(const_cast(*MF), MFAM); + } else { + verifyMachineFunction(Banner, *MF); + } } } }); @@ -2515,7 +2524,7 @@ void StandardInstrumentations::registerCallbacks( PrintChangedIR.registerCallbacks(PIC); PseudoProbeVerification.registerCallbacks(PIC); if (VerifyEach) - Verify.registerCallbacks(PIC); + Verify.registerCallbacks(PIC, MAM); PrintChangedDiff.registerCallbacks(PIC); WebsiteChangeReporter.registerCallbacks(PIC); ChangeTester.registerCallbacks(PIC); diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index 41b66c07bfd436..a3f350e9ca8b03 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -262,6 +262,117 @@ const char *Instruction::getOpcodeName(Opcode Opc) { llvm_unreachable("Unknown Opcode"); } +llvm::Instruction *Instruction::getTopmostLLVMInstruction() const { + Instruction *Prev = getPrevNode(); + if (Prev == nullptr) { + // If at top of the BB, return the first BB instruction. + return &*cast(getParent()->Val)->begin(); + } + // Else get the Previous sandbox IR instruction's bottom IR instruction and + // return its successor. + llvm::Instruction *PrevBotI = cast(Prev->Val); + return PrevBotI->getNextNode(); +} + +BBIterator Instruction::getIterator() const { + auto *I = cast(Val); + return BasicBlock::iterator(I->getParent(), I->getIterator(), &Ctx); +} + +Instruction *Instruction::getNextNode() const { + assert(getParent() != nullptr && "Detached!"); + assert(getIterator() != getParent()->end() && "Already at end!"); + auto *LLVMI = cast(Val); + assert(LLVMI->getParent() != nullptr && "LLVM IR instr is detached!"); + auto *NextLLVMI = LLVMI->getNextNode(); + auto *NextI = cast_or_null(Ctx.getValue(NextLLVMI)); + if (NextI == nullptr) + return nullptr; + return NextI; +} + +Instruction *Instruction::getPrevNode() const { + assert(getParent() != nullptr && "Detached!"); + auto It = getIterator(); + if (It != getParent()->begin()) + return std::prev(getIterator()).get(); + return nullptr; +} + +void Instruction::removeFromParent() { + // Detach all the LLVM IR instructions from their parent BB. + for (llvm::Instruction *I : getLLVMInstrs()) + I->removeFromParent(); +} + +void Instruction::eraseFromParent() { + assert(users().empty() && "Still connected to users, can't erase!"); + std::unique_ptr Detached = Ctx.detach(this); + // We don't have Tracking yet, so just erase the LLVM IR instructions. + // Erase in reverse to avoid erasing nstructions with attached uses. + auto Instrs = getLLVMInstrs(); + for (llvm::Instruction *I : reverse(Instrs)) + I->eraseFromParent(); +} + +void Instruction::moveBefore(BasicBlock &BB, const BBIterator &WhereIt) { + if (std::next(getIterator()) == WhereIt) + // Destination is same as origin, nothing to do. + return; + auto *LLVMBB = cast(BB.Val); + llvm::BasicBlock::iterator It; + if (WhereIt == BB.end()) { + It = LLVMBB->end(); + } else { + Instruction *WhereI = &*WhereIt; + It = WhereI->getTopmostLLVMInstruction()->getIterator(); + } + // TODO: Move this to the verifier of sandboxir::Instruction. + assert(is_sorted(getLLVMInstrs(), + [](auto *I1, auto *I2) { return I1->comesBefore(I2); }) && + "Expected program order!"); + // Do the actual move in LLVM IR. + for (auto *I : getLLVMInstrs()) + I->moveBefore(*LLVMBB, It); +} + +void Instruction::insertBefore(Instruction *BeforeI) { + llvm::Instruction *BeforeTopI = BeforeI->getTopmostLLVMInstruction(); + // TODO: Move this to the verifier of sandboxir::Instruction. + assert(is_sorted(getLLVMInstrs(), + [](auto *I1, auto *I2) { return I1->comesBefore(I2); }) && + "Expected program order!"); + for (llvm::Instruction *I : getLLVMInstrs()) + I->insertBefore(BeforeTopI); +} + +void Instruction::insertAfter(Instruction *AfterI) { + insertInto(AfterI->getParent(), std::next(AfterI->getIterator())); +} + +void Instruction::insertInto(BasicBlock *BB, const BBIterator &WhereIt) { + llvm::BasicBlock *LLVMBB = cast(BB->Val); + llvm::Instruction *LLVMBeforeI; + llvm::BasicBlock::iterator LLVMBeforeIt; + if (WhereIt != BB->end()) { + Instruction *BeforeI = &*WhereIt; + LLVMBeforeI = BeforeI->getTopmostLLVMInstruction(); + LLVMBeforeIt = LLVMBeforeI->getIterator(); + } else { + LLVMBeforeI = nullptr; + LLVMBeforeIt = LLVMBB->end(); + } + for (llvm::Instruction *I : getLLVMInstrs()) + I->insertInto(LLVMBB, LLVMBeforeIt); +} + +BasicBlock *Instruction::getParent() const { + auto *BB = cast(Val)->getParent(); + if (BB == nullptr) + return nullptr; + return cast(Ctx.getValue(BB)); +} + bool Instruction::classof(const sandboxir::Value *From) { switch (From->getSubclassID()) { #define DEF_INSTR(ID, OPC, CLASS) \ @@ -344,6 +455,24 @@ BasicBlock::iterator::getInstr(llvm::BasicBlock::iterator It) const { return cast_or_null(Ctx->getValue(&*It)); } +std::unique_ptr Context::detachLLVMValue(llvm::Value *V) { + std::unique_ptr Erased; + auto It = LLVMValueToValueMap.find(V); + if (It != LLVMValueToValueMap.end()) { + auto *Val = It->second.release(); + Erased = std::unique_ptr(Val); + LLVMValueToValueMap.erase(It); + } + return Erased; +} + +std::unique_ptr Context::detach(Value *V) { + assert(V->getSubclassID() != Value::ClassID::Constant && + "Can't detach a constant!"); + assert(V->getSubclassID() != Value::ClassID::User && "Can't detach a user!"); + return detachLLVMValue(V->Val); +} + Value *Context::registerValue(std::unique_ptr &&VPtr) { assert(VPtr->getSubclassID() != Value::ClassID::User && "Can't register a user!"); diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc index 6e679f74869f0f..cf05db546e0214 100644 --- a/llvm/lib/Support/Unix/Path.inc +++ b/llvm/lib/Support/Unix/Path.inc @@ -516,7 +516,7 @@ static bool is_local_impl(struct STATVFS &Vfs) { // target StringRef fstype(Vfs.f_basetype); // NFS is the only non-local fstype?? - return !fstype.equals("nfs"); + return fstype != "nfs"; #elif defined(_AIX) // Call mntctl; try more than twice in case of timing issues with a concurrent // mount. diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index e3c5a143b28892..527496f1a63749 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -35,7 +35,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/CodeGenTypes/MachineValueType.h" #include "llvm/IR/Argument.h" diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index c11e1195903e57..0f1e860fac7322 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -978,11 +978,7 @@ void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero, // For GPRs, we only care to clear out the 64-bit register. if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE)) GPRsToZero.set(XReg); - } else if (AArch64::FPR128RegClass.contains(Reg) || - AArch64::FPR64RegClass.contains(Reg) || - AArch64::FPR32RegClass.contains(Reg) || - AArch64::FPR16RegClass.contains(Reg) || - AArch64::FPR8RegClass.contains(Reg)) { + } else if (AArch64InstrInfo::isFpOrNEON(Reg)) { // For FPRs, if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE)) FPRsToZero.set(XReg); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7294da474c4bc1..df9b0ae1a632f3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -48,7 +48,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" @@ -3870,10 +3870,15 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, // cmp w13, w12 // can be turned into: // cmp w12, w11, lsl #1 - if (!isa(RHS) || !isLegalArithImmed(RHS->getAsZExtVal())) { - SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS; - - if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) { + if (!isa(RHS) || + !isLegalArithImmed(RHS->getAsAPIntVal().abs().getZExtValue())) { + bool LHSIsCMN = isCMN(LHS, CC); + bool RHSIsCMN = isCMN(RHS, CC); + SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS; + SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS; + + if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) > + getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) { std::swap(LHS, RHS); CC = ISD::getSetCCSwappedOperands(CC); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 047c852bb01d25..fcdd47541be828 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -907,6 +907,8 @@ class AArch64TargetLowering : public TargetLowering { bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; + bool shouldExpandCmpUsingSelects() const override { return true; } + bool isComplexDeinterleavingSupported() const override; bool isComplexDeinterleavingOperationSupported( ComplexDeinterleavingOperation Operation, Type *Ty) const override; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index eb8730a8c8dca4..1b301a4a05fc5b 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -4188,17 +4188,24 @@ bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) { } } +bool AArch64InstrInfo::isFpOrNEON(Register Reg) { + if (Reg == 0) + return false; + assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON"); + return AArch64::FPR128RegClass.contains(Reg) || + AArch64::FPR64RegClass.contains(Reg) || + AArch64::FPR32RegClass.contains(Reg) || + AArch64::FPR16RegClass.contains(Reg) || + AArch64::FPR8RegClass.contains(Reg); +} + bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) { auto IsFPR = [&](const MachineOperand &Op) { if (!Op.isReg()) return false; auto Reg = Op.getReg(); if (Reg.isPhysical()) - return AArch64::FPR128RegClass.contains(Reg) || - AArch64::FPR64RegClass.contains(Reg) || - AArch64::FPR32RegClass.contains(Reg) || - AArch64::FPR16RegClass.contains(Reg) || - AArch64::FPR8RegClass.contains(Reg); + return isFpOrNEON(Reg); const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); return TRC == &AArch64::FPR128RegClass || diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 792e0c3063b101..69ee0a70765e1c 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -251,6 +251,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { /// Returns the immediate offset operator of a load/store. static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI); + /// Returns whether the physical register is FP or NEON. + static bool isFpOrNEON(Register Reg); + /// Returns whether the instruction is FP or NEON. static bool isFpOrNEON(const MachineInstr &MI); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 152a6c2e95b278..dd11f748821153 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -2700,6 +2700,11 @@ def : InstAlias<"tst $src1, $src2$sh", def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>; def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>; +// Emit (and 0xFFFFFFFF) as a ORRWrr move which may be eliminated. +let AddedComplexity = 6 in +def : Pat<(i64 (and GPR64:$Rn, 0xffffffff)), + (SUBREG_TO_REG (i64 0), (ORRWrr WZR, (EXTRACT_SUBREG GPR64:$Rn, sub_32)), sub_32)>; + //===----------------------------------------------------------------------===// // One operand data processing instructions. diff --git a/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp index 367594f8614da0..f07afe7089aa69 100644 --- a/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp +++ b/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp @@ -16,6 +16,7 @@ #include "AArch64PBQPRegAlloc.h" #include "AArch64.h" +#include "AArch64InstrInfo.h" #include "AArch64RegisterInfo.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -32,14 +33,6 @@ using namespace llvm; namespace { -#ifndef NDEBUG -bool isFPReg(unsigned reg) { - return AArch64::FPR32RegClass.contains(reg) || - AArch64::FPR64RegClass.contains(reg) || - AArch64::FPR128RegClass.contains(reg); -} -#endif - bool isOdd(unsigned reg) { switch (reg) { default: @@ -147,8 +140,10 @@ bool isOdd(unsigned reg) { } bool haveSameParity(unsigned reg1, unsigned reg2) { - assert(isFPReg(reg1) && "Expecting an FP register for reg1"); - assert(isFPReg(reg2) && "Expecting an FP register for reg2"); + assert(AArch64InstrInfo::isFpOrNEON(reg1) && + "Expecting an FP register for reg1"); + assert(AArch64InstrInfo::isFpOrNEON(reg2) && + "Expecting an FP register for reg2"); return isOdd(reg1) == isOdd(reg2); } diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index dfaa67dd1959d4..4dc33e6168cbda 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -160,25 +160,30 @@ def GPR64common : RegisterClass<"AArch64", [i64], 64, (add (sequence "X%u", 0, 28), FP, LR)> { let AltOrders = [(rotl GPR64common, 8)]; let AltOrderSelect = [{ return 1; }]; + let DecoderMethod = "DecodeSimpleRegisterClass"; } // GPR register classes which exclude SP/WSP. def GPR32 : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR)> { let AltOrders = [(rotl GPR32, 8)]; let AltOrderSelect = [{ return 1; }]; + let DecoderMethod = "DecodeSimpleRegisterClass"; } def GPR64 : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR)> { let AltOrders = [(rotl GPR64, 8)]; let AltOrderSelect = [{ return 1; }]; + let DecoderMethod = "DecodeSimpleRegisterClass"; } // GPR register classes which include SP/WSP. def GPR32sp : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WSP)> { let AltOrders = [(rotl GPR32sp, 8)]; let AltOrderSelect = [{ return 1; }]; + let DecoderMethod = "DecodeSimpleRegisterClass"; } def GPR64sp : RegisterClass<"AArch64", [i64], 64, (add GPR64common, SP)> { let AltOrders = [(rotl GPR64sp, 8)]; let AltOrderSelect = [{ return 1; }]; + let DecoderMethod = "DecodeSimpleRegisterClass"; } def GPR32sponly : RegisterClass<"AArch64", [i32], 32, (add WSP)>; @@ -446,18 +451,24 @@ def Q31 : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias; def FPR8 : RegisterClass<"AArch64", [i8], 8, (sequence "B%u", 0, 31)> { let Size = 8; + let DecoderMethod = "DecodeSimpleRegisterClass"; } def FPR16 : RegisterClass<"AArch64", [f16, bf16, i16], 16, (sequence "H%u", 0, 31)> { let Size = 16; + let DecoderMethod = "DecodeSimpleRegisterClass"; } def FPR16_lo : RegisterClass<"AArch64", [f16], 16, (trunc FPR16, 16)> { let Size = 16; } -def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>; +def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)> { + let DecoderMethod = "DecodeSimpleRegisterClass"; +} def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32, v1i64, v4f16, v4bf16], - 64, (sequence "D%u", 0, 31)>; + 64, (sequence "D%u", 0, 31)> { + let DecoderMethod = "DecodeSimpleRegisterClass"; +} def FPR64_lo : RegisterClass<"AArch64", [v8i8, v4i16, v2i32, v1i64, v4f16, v4bf16, v2f32, v1f64], @@ -469,21 +480,27 @@ def FPR64_lo : RegisterClass<"AArch64", def FPR128 : RegisterClass<"AArch64", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128, v8f16, v8bf16], - 128, (sequence "Q%u", 0, 31)>; + 128, (sequence "Q%u", 0, 31)> { + let DecoderMethod = "DecodeSimpleRegisterClass"; +} // The lower 16 vector registers. Some instructions can only take registers // in this range. def FPR128_lo : RegisterClass<"AArch64", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16, v8bf16], - 128, (trunc FPR128, 16)>; + 128, (trunc FPR128, 16)> { + let DecoderMethod = "DecodeSimpleRegisterClass"; +} // The lower 8 vector registers. Some instructions can only take registers // in this range. def FPR128_0to7 : RegisterClass<"AArch64", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16, v8bf16], - 128, (trunc FPR128, 8)>; + 128, (trunc FPR128, 8)> { + let DecoderMethod = "DecodeSimpleRegisterClass"; +} // Pairs, triples, and quads of 64-bit vector registers. def DSeqPairs : RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]>; @@ -495,12 +512,15 @@ def DSeqQuads : RegisterTuples<[dsub0, dsub1, dsub2, dsub3], (rotl FPR64, 2), (rotl FPR64, 3)]>; def DD : RegisterClass<"AArch64", [untyped], 64, (add DSeqPairs)> { let Size = 128; + let DecoderMethod = "DecodeSimpleRegisterClass"; } def DDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqTriples)> { let Size = 192; + let DecoderMethod = "DecodeSimpleRegisterClass"; } def DDDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqQuads)> { let Size = 256; + let DecoderMethod = "DecodeSimpleRegisterClass"; } // Pairs, triples, and quads of 128-bit vector registers. @@ -513,12 +533,15 @@ def QSeqQuads : RegisterTuples<[qsub0, qsub1, qsub2, qsub3], (rotl FPR128, 2), (rotl FPR128, 3)]>; def QQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqPairs)> { let Size = 256; + let DecoderMethod = "DecodeSimpleRegisterClass"; } def QQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqTriples)> { let Size = 384; + let DecoderMethod = "DecodeSimpleRegisterClass"; } def QQQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqQuads)> { let Size = 512; + let DecoderMethod = "DecodeSimpleRegisterClass"; } @@ -904,9 +927,15 @@ class PPRClass : RegisterClass< let Size = 16; } -def PPR : PPRClass<0, 15>; -def PPR_3b : PPRClass<0, 7>; // Restricted 3 bit SVE predicate register class. -def PPR_p8to15 : PPRClass<8, 15>; +def PPR : PPRClass<0, 15> { + let DecoderMethod = "DecodeSimpleRegisterClass"; +} +def PPR_3b : PPRClass<0, 7> { // Restricted 3 bit SVE predicate register class. + let DecoderMethod = "DecodeSimpleRegisterClass"; +} +def PPR_p8to15 : PPRClass<8, 15> { + let DecoderMethod = "DecodeSimpleRegisterClass"; +} class PPRAsmOperand : AsmOperandClass { let Name = "SVE" # name # "Reg"; @@ -941,7 +970,9 @@ class PNRClass : RegisterClass< let Size = 16; } -def PNR : PNRClass<0, 15>; +def PNR : PNRClass<0, 15> { + let DecoderMethod = "DecodeSimpleRegisterClass"; +} def PNR_3b : PNRClass<0, 7>; def PNR_p8to15 : PNRClass<8, 15>; @@ -982,7 +1013,7 @@ class PNRP8to15RegOp { let PrintMethod = "printPredicateAsCounter<" # Width # ">"; let EncoderMethod = "EncodePNR_p8to15"; - let DecoderMethod = "DecodePNR_p8to15RegisterClass"; + let DecoderMethod = "DecodeSimpleRegisterClass"; } def PNRAny_p8to15 : PNRP8to15RegOp<"", PNRAsmAny_p8to15, 0, PNR_p8to15>; @@ -1013,7 +1044,9 @@ class PPRorPNRAsmOperand: AsmOperandCla let ParserMethod = "tryParseSVEPredicateOrPredicateAsCounterVector"; } -def PPRorPNR : PPRorPNRClass; +def PPRorPNR : PPRorPNRClass { + let DecoderMethod = "DecodeSimpleRegisterClass"; +} def PPRorPNRAsmOp8 : PPRorPNRAsmOperand<"PPRorPNRB", "PPRorPNR", 8>; def PPRorPNRAsmOpAny : PPRorPNRAsmOperand<"PPRorPNRAny", "PPRorPNR", 0>; def PPRorPNRAny : PPRRegOp<"", PPRorPNRAsmOpAny, ElementSizeNone, PPRorPNR>; @@ -1024,6 +1057,7 @@ def PSeqPairs : RegisterTuples<[psub0, psub1], [(rotl PPR, 0), (rotl PPR, 1)]>; def PPR2 : RegisterClass<"AArch64", [untyped], 16, (add PSeqPairs)> { let Size = 32; + let DecoderMethod = "DecodeSimpleRegisterClass"; } class PPRVectorList : AsmOperandClass { @@ -1097,9 +1131,15 @@ class ZPRClass : RegisterClass<"AArch64", let Size = 128; } -def ZPR : ZPRClass<31>; -def ZPR_4b : ZPRClass<15>; // Restricted 4 bit SVE vector register class. -def ZPR_3b : ZPRClass<7>; // Restricted 3 bit SVE vector register class. +def ZPR : ZPRClass<31> { + let DecoderMethod = "DecodeSimpleRegisterClass"; +} +def ZPR_4b : ZPRClass<15> { // Restricted 4 bit SVE vector register class. + let DecoderMethod = "DecodeSimpleRegisterClass"; +} +def ZPR_3b : ZPRClass<7> { // Restricted 3 bit SVE vector register class. + let DecoderMethod = "DecodeSimpleRegisterClass"; +} class ZPRAsmOperand : AsmOperandClass { @@ -1176,12 +1216,15 @@ def ZSeqQuads : RegisterTuples<[zsub0, zsub1, zsub2, zsub3], [(rotl ZPR, 0), ( def ZPR2 : RegisterClass<"AArch64", [untyped], 128, (add ZSeqPairs)> { let Size = 256; + let DecoderMethod = "DecodeSimpleRegisterClass"; } def ZPR3 : RegisterClass<"AArch64", [untyped], 128, (add ZSeqTriples)> { let Size = 384; + let DecoderMethod = "DecodeSimpleRegisterClass"; } def ZPR4 : RegisterClass<"AArch64", [untyped], 128, (add ZSeqQuads)> { let Size = 512; + let DecoderMethod = "DecodeSimpleRegisterClass"; } class ZPRVectorList : AsmOperandClass { @@ -1379,10 +1422,12 @@ def ZStridedQuadsHi : RegisterTuples<[zsub0, zsub1, zsub2, zsub3], [ def ZPR2Strided : RegisterClass<"AArch64", [untyped], 128, (add ZStridedPairsLo, ZStridedPairsHi)> { let Size = 256; + let DecoderMethod = "DecodeSimpleRegisterClass"; } def ZPR4Strided : RegisterClass<"AArch64", [untyped], 128, (add ZStridedQuadsLo, ZStridedQuadsHi)> { let Size = 512; + let DecoderMethod = "DecodeSimpleRegisterClass"; } def ZPR2StridedOrContiguous : RegisterClass<"AArch64", [untyped], 128, @@ -1401,7 +1446,7 @@ class ZPRVectorListStrided } let EncoderMethod = "EncodeZPR2StridedRegisterClass", - DecoderMethod = "DecodeZPR2StridedRegisterClass" in { + DecoderMethod = "DecodeSimpleRegisterClass" in { def ZZ_b_strided : RegisterOperand"> { let ParserMatchClass = ZPRVectorListStrided<8, 2, 8>; @@ -1439,7 +1484,7 @@ def ZPR4StridedOrContiguous : RegisterClass<"AArch64", [untyped], 128, } let EncoderMethod = "EncodeZPR4StridedRegisterClass", - DecoderMethod = "DecodeZPR4StridedRegisterClass" in { + DecoderMethod = "DecodeSimpleRegisterClass" in { def ZZZZ_b_strided : RegisterOperand"> { let ParserMatchClass = ZPRVectorListStrided<8, 4, 4>; @@ -1774,9 +1819,11 @@ def MatrixTileList : MatrixTileListOperand<>; def MatrixIndexGPR32_8_11 : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 8, 11)> { let DiagnosticType = "InvalidMatrixIndexGPR32_8_11"; + let DecoderMethod = "DecodeSimpleRegisterClass"; } def MatrixIndexGPR32_12_15 : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 12, 15)> { let DiagnosticType = "InvalidMatrixIndexGPR32_12_15"; + let DecoderMethod = "DecodeSimpleRegisterClass"; } def MatrixIndexGPR32Op8_11 : RegisterOperand { let EncoderMethod = "encodeMatrixIndexGPR32"; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 0ee8136884119e..45148449dfb821 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2785,7 +2785,7 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, std::pair LT = getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext())); unsigned NumElements = AArch64::SVEBitsPerBlock / - LT.second.getVectorElementType().getSizeInBits(); + LT.second.getScalarSizeInBits(); return AdjustCost( LT.first * getCastInstrCost( diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index ddb875e73ff5a9..b97f00c9931122 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -38,100 +38,19 @@ using DecodeStatus = MCDisassembler::DecodeStatus; // Forward declare these because the autogenerated code will reference them. // Definitions are further down. -static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo, +template +static DecodeStatus DecodeSimpleRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const MCDisassembler *Decoder); -static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeFPR128_0to7RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); static DecodeStatus DecodeGPR64x8ClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const MCDisassembler *Decoder); -static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodeMatrixIndexGPR32_8_11RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); -static DecodeStatus -DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeZPR_4bRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeZPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeZPR2RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeZPR3RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodeZPR4RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); static DecodeStatus DecodeZPR2Mul2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); static DecodeStatus DecodeZPR4Mul4RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeZPR2StridedRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeZPR4StridedRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); template static DecodeStatus DecodeMatrixTile(MCInst &Inst, unsigned RegNo, uint64_t Address, @@ -140,24 +59,6 @@ static DecodeStatus DecodeMatrixTileListRegisterClass(MCInst &Inst, unsigned RegMask, uint64_t Address, const MCDisassembler *Decoder); -static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodePPRorPNRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder); -static DecodeStatus DecodePNRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus -DecodePNR_p8to15RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const MCDisassembler *Decoder); -static DecodeStatus DecodePPR2RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); static DecodeStatus DecodePPR2Mul2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); @@ -314,8 +215,8 @@ static DecodeStatus DecodePRFMRegInstruction(MCInst &Inst, uint32_t insn, #define SoftFail MCDisassembler::SoftFail static MCDisassembler *createAArch64Disassembler(const Target &T, - const MCSubtargetInfo &STI, - MCContext &Ctx) { + const MCSubtargetInfo &STI, + MCContext &Ctx) { return new AArch64Disassembler(STI, Ctx, T.createMCInstrInfo()); } @@ -426,103 +327,15 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Disassembler() { createAArch64ExternalSymbolizer); } -static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, +template +static DecodeStatus DecodeSimpleRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const MCDisassembler *Decoder) { - if (RegNo > 31) + if (RegNo > NumRegsInClass - 1) return Fail; unsigned Register = - AArch64MCRegisterClasses[AArch64::FPR128RegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus -DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 15) - return Fail; - return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder); -} - -static DecodeStatus -DecodeFPR128_0to7RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 7) - return Fail; - return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder); -} - -static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = - AArch64MCRegisterClasses[AArch64::FPR64RegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = - AArch64MCRegisterClasses[AArch64::FPR32RegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = - AArch64MCRegisterClasses[AArch64::FPR16RegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = - AArch64MCRegisterClasses[AArch64::FPR8RegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus -DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 30) - return Fail; - - unsigned Register = - AArch64MCRegisterClasses[AArch64::GPR64commonRegClassID].getRegister( - RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = - AArch64MCRegisterClasses[AArch64::GPR64RegClassID].getRegister(RegNo); + AArch64MCRegisterClasses[RegClassID].getRegister(RegNo + FirstReg); Inst.addOperand(MCOperand::createReg(Register)); return Success; } @@ -542,129 +355,6 @@ DecodeGPR64x8ClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, return Success; } -static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = - AArch64MCRegisterClasses[AArch64::GPR64spRegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus -DecodeMatrixIndexGPR32_8_11RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { - if (RegNo > 3) - return Fail; - - unsigned Register = - AArch64MCRegisterClasses[AArch64::MatrixIndexGPR32_8_11RegClassID] - .getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus -DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 3) - return Fail; - - unsigned Register = - AArch64MCRegisterClasses[AArch64::MatrixIndexGPR32_12_15RegClassID] - .getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = - AArch64MCRegisterClasses[AArch64::GPR32RegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = - AArch64MCRegisterClasses[AArch64::GPR32spRegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = - AArch64MCRegisterClasses[AArch64::ZPRRegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus DecodeZPR_4bRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo > 15) - return Fail; - return DecodeZPRRegisterClass(Inst, RegNo, Address, Decoder); -} - -static DecodeStatus DecodeZPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo > 7) - return Fail; - return DecodeZPRRegisterClass(Inst, RegNo, Address, Decoder); -} - -static DecodeStatus DecodeZPR2RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = - AArch64MCRegisterClasses[AArch64::ZPR2RegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus DecodeZPR3RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = - AArch64MCRegisterClasses[AArch64::ZPR3RegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus DecodeZPR4RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = - AArch64MCRegisterClasses[AArch64::ZPR4RegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - static DecodeStatus DecodeZPR2Mul2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { @@ -687,30 +377,6 @@ static DecodeStatus DecodeZPR4Mul4RegisterClass(MCInst &Inst, unsigned RegNo, return Success; } -static DecodeStatus DecodeZPR2StridedRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder) { - if (RegNo > 15) - return Fail; - unsigned Register = - AArch64MCRegisterClasses[AArch64::ZPR2StridedRegClassID].getRegister( - RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus DecodeZPR4StridedRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder) { - if (RegNo > 7) - return Fail; - unsigned Register = - AArch64MCRegisterClasses[AArch64::ZPR4StridedRegClassID].getRegister( - RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - static DecodeStatus DecodeMatrixTileListRegisterClass(MCInst &Inst, unsigned RegMask, uint64_t Address, @@ -744,74 +410,6 @@ static DecodeStatus DecodeMatrixTile(MCInst &Inst, unsigned RegNo, return Success; } -static DecodeStatus DecodePPRorPNRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 15) - return Fail; - - unsigned Register = - AArch64MCRegisterClasses[AArch64::PPRorPNRRegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 15) - return Fail; - - unsigned Register = - AArch64MCRegisterClasses[AArch64::PPRRegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus DecodePNRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 15) - return Fail; - - unsigned Register = - AArch64MCRegisterClasses[AArch64::PNRRegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 7) - return Fail; - - // Just reuse the PPR decode table - return DecodePPRRegisterClass(Inst, RegNo, Addr, Decoder); -} - -static DecodeStatus -DecodePNR_p8to15RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 7) - return Fail; - - // Just reuse the PPR decode table - return DecodePNRRegisterClass(Inst, RegNo + 8, Addr, Decoder); -} - -static DecodeStatus DecodePPR2RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder) { - if (RegNo > 15) - return Fail; - - unsigned Register = - AArch64MCRegisterClasses[AArch64::PPR2RegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - static DecodeStatus DecodePPR2Mul2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { @@ -823,72 +421,6 @@ static DecodeStatus DecodePPR2Mul2RegisterClass(MCInst &Inst, unsigned RegNo, return Success; } -static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = - AArch64MCRegisterClasses[AArch64::QQRegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = - AArch64MCRegisterClasses[AArch64::QQQRegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = - AArch64MCRegisterClasses[AArch64::QQQQRegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = - AArch64MCRegisterClasses[AArch64::DDRegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = - AArch64MCRegisterClasses[AArch64::DDDRegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - -static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const MCDisassembler *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = - AArch64MCRegisterClasses[AArch64::DDDDRegClassID].getRegister(RegNo); - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm, uint64_t Addr, const MCDisassembler *Decoder) { @@ -938,7 +470,7 @@ static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm, static DecodeStatus DecodeMemExtend(MCInst &Inst, unsigned Imm, uint64_t Address, const MCDisassembler *Decoder) { - Inst.addOperand(MCOperand::createImm((Imm >> 1) & 1)); + Inst.addOperand(MCOperand::createImm((Imm >> 1) & 1)); Inst.addOperand(MCOperand::createImm(Imm & 1)); return Success; } @@ -971,11 +503,15 @@ static DecodeStatus DecodeFMOVLaneInstruction(MCInst &Inst, unsigned Insn, unsigned IsToVec = fieldFromInstruction(Insn, 16, 1); if (IsToVec) { - DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder); - DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder); + DecodeSimpleRegisterClass( + Inst, Rd, Address, Decoder); + DecodeSimpleRegisterClass( + Inst, Rn, Address, Decoder); } else { - DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder); - DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder); + DecodeSimpleRegisterClass( + Inst, Rd, Address, Decoder); + DecodeSimpleRegisterClass( + Inst, Rn, Address, Decoder); } // Add the lane @@ -1093,9 +629,12 @@ DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, // if sf == '0' and imm6<5> == '1' then ReservedValue() if (shiftLo >> 5 == 1) return Fail; - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rd, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rm, Addr, + Decoder); break; } case AArch64::ADDXrs: @@ -1114,9 +653,12 @@ DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, case AArch64::ORNXrs: case AArch64::EORXrs: case AArch64::EONXrs: - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rd, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rm, Addr, + Decoder); break; } @@ -1139,12 +681,14 @@ static DecodeStatus DecodeMoveImmInstruction(MCInst &Inst, uint32_t insn, case AArch64::MOVKWi: if (shift & (1U << 5)) return Fail; - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rd, Addr, + Decoder); break; case AArch64::MOVZXi: case AArch64::MOVNXi: case AArch64::MOVKXi: - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rd, Addr, + Decoder); break; } @@ -1179,38 +723,46 @@ DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, case AArch64::LDRSHWui: case AArch64::STRWui: case AArch64::LDRWui: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); break; case AArch64::LDRSBXui: case AArch64::LDRSHXui: case AArch64::LDRSWui: case AArch64::STRXui: case AArch64::LDRXui: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); break; case AArch64::LDRQui: case AArch64::STRQui: - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); break; case AArch64::LDRDui: case AArch64::STRDui: - DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); break; case AArch64::LDRSui: case AArch64::STRSui: - DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); break; case AArch64::LDRHui: case AArch64::STRHui: - DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); break; case AArch64::LDRBui: case AArch64::STRBui: - DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); break; } - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); if (!Decoder->tryAddingSymbolicOperand(Inst, offset, Addr, Fail, 0, 0, 4)) Inst.addOperand(MCOperand::createImm(offset)); return Success; @@ -1278,7 +830,8 @@ static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn, case AArch64::STRBpre: case AArch64::LDRBpost: case AArch64::STRBpost: - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); break; } @@ -1329,7 +882,8 @@ static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn, case AArch64::LDAPURHi: case AArch64::LDAPURSHWi: case AArch64::LDAPURi: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); break; case AArch64::LDURSBXi: case AArch64::LDURSHXi: @@ -1356,7 +910,8 @@ static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn, case AArch64::LDAPURSBXi: case AArch64::STLURXi: case AArch64::LDAPURXi: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); break; case AArch64::LDURQi: case AArch64::STURQi: @@ -1364,7 +919,8 @@ static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn, case AArch64::STRQpre: case AArch64::LDRQpost: case AArch64::STRQpost: - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); break; case AArch64::LDURDi: case AArch64::STURDi: @@ -1372,7 +928,8 @@ static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn, case AArch64::STRDpre: case AArch64::LDRDpost: case AArch64::STRDpost: - DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); break; case AArch64::LDURSi: case AArch64::STURSi: @@ -1380,7 +937,8 @@ static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn, case AArch64::STRSpre: case AArch64::LDRSpost: case AArch64::STRSpost: - DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); break; case AArch64::LDURHi: case AArch64::STURHi: @@ -1388,7 +946,8 @@ static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn, case AArch64::STRHpre: case AArch64::LDRHpost: case AArch64::STRHpost: - DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); break; case AArch64::LDURBi: case AArch64::STURBi: @@ -1396,11 +955,13 @@ static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn, case AArch64::STRBpre: case AArch64::LDRBpost: case AArch64::STRBpost: - DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); break; } - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); Inst.addOperand(MCOperand::createImm(offset)); bool IsLoad = fieldFromInstruction(insn, 22, 1); @@ -1432,7 +993,8 @@ DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, case AArch64::STXRW: case AArch64::STXRB: case AArch64::STXRH: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rs, Addr, + Decoder); [[fallthrough]]; case AArch64::LDARW: case AArch64::LDARB: @@ -1452,11 +1014,13 @@ DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, case AArch64::LDLARW: case AArch64::LDLARB: case AArch64::LDLARH: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); break; case AArch64::STLXRX: case AArch64::STXRX: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rs, Addr, + Decoder); [[fallthrough]]; case AArch64::LDARX: case AArch64::LDAXRX: @@ -1464,29 +1028,37 @@ DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, case AArch64::STLRX: case AArch64::LDLARX: case AArch64::STLLRX: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); break; case AArch64::STLXPW: case AArch64::STXPW: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rs, Addr, + Decoder); [[fallthrough]]; case AArch64::LDAXPW: case AArch64::LDXPW: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rt2, Addr, + Decoder); break; case AArch64::STLXPX: case AArch64::STXPX: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rs, Addr, + Decoder); [[fallthrough]]; case AArch64::LDAXPX: case AArch64::LDXPX: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rt2, Addr, + Decoder); break; } - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); // You shouldn't load to the same register twice in an instruction... if ((Opcode == AArch64::LDAXPW || Opcode == AArch64::LDXPW || @@ -1542,7 +1114,8 @@ static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, case AArch64::STPSpre: case AArch64::STGPpre: case AArch64::STGPpost: - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); break; } @@ -1565,8 +1138,10 @@ static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, case AArch64::STPXi: case AArch64::LDPSWi: case AArch64::STGPi: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rt2, Addr, + Decoder); break; case AArch64::LDPWpost: case AArch64::STPWpost: @@ -1578,8 +1153,10 @@ static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, case AArch64::STNPWi: case AArch64::LDPWi: case AArch64::STPWi: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rt2, Addr, + Decoder); break; case AArch64::LDNPQi: case AArch64::STNPQi: @@ -1589,8 +1166,10 @@ static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, case AArch64::STPQi: case AArch64::LDPQpre: case AArch64::STPQpre: - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); - DecodeFPR128RegisterClass(Inst, Rt2, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rt2, Addr, + Decoder); break; case AArch64::LDNPDi: case AArch64::STNPDi: @@ -1600,8 +1179,10 @@ static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, case AArch64::STPDi: case AArch64::LDPDpre: case AArch64::STPDpre: - DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); - DecodeFPR64RegisterClass(Inst, Rt2, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rt2, Addr, + Decoder); break; case AArch64::LDNPSi: case AArch64::STNPSi: @@ -1611,12 +1192,15 @@ static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, case AArch64::STPSi: case AArch64::LDPSpre: case AArch64::STPSpre: - DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); - DecodeFPR32RegisterClass(Inst, Rt2, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rt2, Addr, + Decoder); break; } - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); Inst.addOperand(MCOperand::createImm(offset)); // You shouldn't load to the same register twice in an instruction... @@ -1645,16 +1229,18 @@ static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn, return Fail; case AArch64::LDRAAwriteback: case AArch64::LDRABwriteback: - DecodeGPR64spRegisterClass(Inst, Rn /* writeback register */, Addr, - Decoder); + DecodeSimpleRegisterClass( + Inst, Rn /* writeback register */, Addr, Decoder); break; case AArch64::LDRAAindexed: case AArch64::LDRABindexed: break; } - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); DecodeSImm<10>(Inst, offset, Addr, Decoder); if (writeback && Rt == Rn && Rn != 31) { @@ -1681,39 +1267,57 @@ static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn, return Fail; case AArch64::ADDWrx: case AArch64::SUBWrx: - DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rd, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rm, Addr, + Decoder); break; case AArch64::ADDSWrx: case AArch64::SUBSWrx: - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rd, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rm, Addr, + Decoder); break; case AArch64::ADDXrx: case AArch64::SUBXrx: - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rd, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rm, Addr, + Decoder); break; case AArch64::ADDSXrx: case AArch64::SUBSXrx: - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rd, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rm, Addr, + Decoder); break; case AArch64::ADDXrx64: case AArch64::SUBXrx64: - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rd, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rm, Addr, + Decoder); break; case AArch64::SUBSXrx64: case AArch64::ADDSXrx64: - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rd, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rm, Addr, + Decoder); break; } @@ -1731,19 +1335,25 @@ static DecodeStatus DecodeLogicalImmInstruction(MCInst &Inst, uint32_t insn, if (Datasize) { if (Inst.getOpcode() == AArch64::ANDSXri) - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rd, Addr, + Decoder); else - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); + DecodeSimpleRegisterClass( + Inst, Rd, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); imm = fieldFromInstruction(insn, 10, 13); if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 64)) return Fail; } else { if (Inst.getOpcode() == AArch64::ANDSWri) - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rd, Addr, + Decoder); else - DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder); + DecodeSimpleRegisterClass( + Inst, Rd, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); imm = fieldFromInstruction(insn, 10, 12); if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 32)) return Fail; @@ -1761,9 +1371,11 @@ static DecodeStatus DecodeModImmInstruction(MCInst &Inst, uint32_t insn, imm |= fieldFromInstruction(insn, 5, 5); if (Inst.getOpcode() == AArch64::MOVID) - DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rd, Addr, + Decoder); else - DecodeFPR128RegisterClass(Inst, Rd, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rd, Addr, + Decoder); Inst.addOperand(MCOperand::createImm(imm)); @@ -1800,8 +1412,10 @@ static DecodeStatus DecodeModImmTiedInstruction(MCInst &Inst, uint32_t insn, imm |= fieldFromInstruction(insn, 5, 5); // Tied operands added twice. - DecodeFPR128RegisterClass(Inst, Rd, Addr, Decoder); - DecodeFPR128RegisterClass(Inst, Rd, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rd, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rd, Addr, + Decoder); Inst.addOperand(MCOperand::createImm(imm)); Inst.addOperand(MCOperand::createImm((cmode & 6) << 2)); @@ -1820,7 +1434,8 @@ static DecodeStatus DecodeAdrInstruction(MCInst &Inst, uint32_t insn, if (imm & (1 << (21 - 1))) imm |= ~((1LL << 21) - 1); - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rd, Addr, + Decoder); if (!Decoder->tryAddingSymbolicOperand(Inst, imm, Addr, Fail, 0, 0, 4)) Inst.addOperand(MCOperand::createImm(imm)); @@ -1844,16 +1459,22 @@ static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn, if (Datasize) { if (Rd == 31 && !S) - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeSimpleRegisterClass( + Inst, Rd, Addr, Decoder); else - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rd, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); } else { if (Rd == 31 && !S) - DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeSimpleRegisterClass( + Inst, Rd, Addr, Decoder); else - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rd, Addr, + Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); } if (!Decoder->tryAddingSymbolicOperand(Inst, Imm, Addr, Fail, 0, 0, 4)) @@ -1939,9 +1560,11 @@ static DecodeStatus DecodeTestAndBranch(MCInst &Inst, uint32_t insn, dst |= ~((1LL << 14) - 1); if (fieldFromInstruction(insn, 31, 1) == 0) - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); else - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); Inst.addOperand(MCOperand::createImm(bit)); if (!Decoder->tryAddingSymbolicOperand(Inst, dst * 4, Addr, true, 0, 0, 4)) Inst.addOperand(MCOperand::createImm(dst)); @@ -1965,17 +1588,15 @@ DecodeGPRSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegClassID, static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, const MCDisassembler *Decoder) { - return DecodeGPRSeqPairsClassRegisterClass(Inst, - AArch64::WSeqPairsClassRegClassID, - RegNo, Addr, Decoder); + return DecodeGPRSeqPairsClassRegisterClass( + Inst, AArch64::WSeqPairsClassRegClassID, RegNo, Addr, Decoder); } static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, const MCDisassembler *Decoder) { - return DecodeGPRSeqPairsClassRegisterClass(Inst, - AArch64::XSeqPairsClassRegClassID, - RegNo, Addr, Decoder); + return DecodeGPRSeqPairsClassRegisterClass( + Inst, AArch64::XSeqPairsClassRegClassID, RegNo, Addr, Decoder); } static DecodeStatus DecodeSyspXzrInstruction(MCInst &Inst, uint32_t insn, @@ -1993,7 +1614,8 @@ static DecodeStatus DecodeSyspXzrInstruction(MCInst &Inst, uint32_t insn, Inst.addOperand(MCOperand::createImm(CRn)); Inst.addOperand(MCOperand::createImm(CRm)); Inst.addOperand(MCOperand::createImm(op2)); - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rt, Addr, + Decoder); return Success; } @@ -2007,9 +1629,11 @@ DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, return Fail; // The same (tied) operand is added twice to the instruction. - DecodeZPRRegisterClass(Inst, Zdn, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Zdn, Addr, + Decoder); if (Inst.getOpcode() != AArch64::DUPM_ZI) - DecodeZPRRegisterClass(Inst, Zdn, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Zdn, Addr, + Decoder); Inst.addOperand(MCOperand::createImm(imm)); return Success; } @@ -2018,7 +1642,7 @@ template static DecodeStatus DecodeSImm(MCInst &Inst, uint64_t Imm, uint64_t Address, const MCDisassembler *Decoder) { if (Imm & ~((1LL << Bits) - 1)) - return Fail; + return Fail; // Imm is a signed immediate, so sign extend it. if (Imm & (1 << (Bits - 1))) @@ -2072,12 +1696,18 @@ static DecodeStatus DecodeCPYMemOpInstruction(MCInst &Inst, uint32_t insn, // All three register operands are written back, so they all appear // twice in the operand list, once as outputs and once as inputs. - if (!DecodeGPR64commonRegisterClass(Inst, Rd, Addr, Decoder) || - !DecodeGPR64commonRegisterClass(Inst, Rs, Addr, Decoder) || - !DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder) || - !DecodeGPR64commonRegisterClass(Inst, Rd, Addr, Decoder) || - !DecodeGPR64commonRegisterClass(Inst, Rs, Addr, Decoder) || - !DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder)) + if (!DecodeSimpleRegisterClass( + Inst, Rd, Addr, Decoder) || + !DecodeSimpleRegisterClass( + Inst, Rs, Addr, Decoder) || + !DecodeSimpleRegisterClass( + Inst, Rn, Addr, Decoder) || + !DecodeSimpleRegisterClass( + Inst, Rd, Addr, Decoder) || + !DecodeSimpleRegisterClass( + Inst, Rs, Addr, Decoder) || + !DecodeSimpleRegisterClass( + Inst, Rn, Addr, Decoder)) return MCDisassembler::Fail; return MCDisassembler::Success; @@ -2097,11 +1727,16 @@ static DecodeStatus DecodeSETMemOpInstruction(MCInst &Inst, uint32_t insn, // Rd and Rn (not Rm) register operands are written back, so they appear // twice in the operand list, once as outputs and once as inputs. - if (!DecodeGPR64commonRegisterClass(Inst, Rd, Addr, Decoder) || - !DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder) || - !DecodeGPR64commonRegisterClass(Inst, Rd, Addr, Decoder) || - !DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder) || - !DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder)) + if (!DecodeSimpleRegisterClass( + Inst, Rd, Addr, Decoder) || + !DecodeSimpleRegisterClass( + Inst, Rn, Addr, Decoder) || + !DecodeSimpleRegisterClass( + Inst, Rd, Addr, Decoder) || + !DecodeSimpleRegisterClass( + Inst, Rn, Addr, Decoder) || + !DecodeSimpleRegisterClass( + Inst, Rm, Addr, Decoder)) return MCDisassembler::Fail; return MCDisassembler::Success; @@ -2123,16 +1758,19 @@ static DecodeStatus DecodePRFMRegInstruction(MCInst &Inst, uint32_t insn, uint64_t Rm = fieldFromInstruction(insn, 16, 5); Inst.addOperand(MCOperand::createImm(Rt)); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rn, Addr, + Decoder); switch (Inst.getOpcode()) { default: return Fail; case AArch64::PRFMroW: - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rm, Addr, + Decoder); break; case AArch64::PRFMroX: - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); + DecodeSimpleRegisterClass(Inst, Rm, Addr, + Decoder); break; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 7a29457f5442f2..d42d5511a82422 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -711,7 +711,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {{s32, s128}, {s64, s128}, {s128, s128}, {s128, s32}, {s128, s64}}); getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) - .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) + .legalFor({{s32, s32}, + {s64, s32}, + {s32, s64}, + {s64, s64}, + {v2s64, v2s64}, + {v4s32, v4s32}, + {v2s32, v2s32}}) .legalIf([=](const LegalityQuery &Query) { return HasFP16 && (Query.Types[0] == s16 || Query.Types[0] == v4s16 || @@ -719,26 +725,35 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) (Query.Types[1] == s32 || Query.Types[1] == s64 || Query.Types[1] == v4s16 || Query.Types[1] == v8s16); }) - .widenScalarToNextPow2(1) - .clampScalar(1, s32, s64) - .widenScalarToNextPow2(0) - .clampScalarOrElt(0, MinFPScalar, s64) - .moreElementsToNextPow2(0) + .scalarizeIf(scalarOrEltWiderThan(1, 64), 1) + .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) + .moreElementsToNextPow2(1) + .widenScalarOrEltToNextPow2OrMinSize(1) + .minScalar(1, s32) + .widenScalarOrEltToNextPow2OrMinSize(0, /*MinSize=*/HasFP16 ? 16 : 32) .widenScalarIf( [=](const LegalityQuery &Query) { - return Query.Types[0].getScalarSizeInBits() < - Query.Types[1].getScalarSizeInBits(); + return Query.Types[1].getScalarSizeInBits() <= 64 && + Query.Types[0].getScalarSizeInBits() < + Query.Types[1].getScalarSizeInBits(); }, LegalizeMutations::changeElementSizeTo(0, 1)) .widenScalarIf( [=](const LegalityQuery &Query) { - return Query.Types[0].getScalarSizeInBits() > - Query.Types[1].getScalarSizeInBits(); + return Query.Types[0].getScalarSizeInBits() <= 64 && + Query.Types[0].getScalarSizeInBits() > + Query.Types[1].getScalarSizeInBits(); }, LegalizeMutations::changeElementSizeTo(1, 0)) .clampNumElements(0, v4s16, v8s16) .clampNumElements(0, v2s32, v4s32) - .clampMaxNumElements(0, s64, 2); + .clampMaxNumElements(0, s64, 2) + .libcallFor({{s16, s128}, + {s32, s128}, + {s64, s128}, + {s128, s128}, + {s128, s32}, + {s128, s64}}); // Control-flow getActionDefinitionsBuilder(G_BRCOND) @@ -1006,7 +1021,21 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .clampNumElements(0, v2s64, v2s64); getActionDefinitionsBuilder(G_CONCAT_VECTORS) - .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}}); + .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}}) + .bitcastIf( + [=](const LegalityQuery &Query) { + return Query.Types[0].getSizeInBits() <= 128 && + Query.Types[1].getSizeInBits() <= 64; + }, + [=](const LegalityQuery &Query) { + const LLT DstTy = Query.Types[0]; + const LLT SrcTy = Query.Types[1]; + return std::pair( + 0, DstTy.changeElementSize(SrcTy.getSizeInBits()) + .changeElementCount( + DstTy.getElementCount().divideCoefficientBy( + SrcTy.getNumElements()))); + }); getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({p0}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 94e8e77b3c0525..dfc8eaea66f7b7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1464,7 +1464,6 @@ def FeatureISAVersion10_Common : FeatureSet< FeatureLDSBankCount32, FeatureDLInsts, FeatureNSAEncoding, - FeatureWavefrontSize32, FeatureBackOffBarrier]>; def FeatureISAVersion10_1_Common : FeatureSet< @@ -1548,7 +1547,6 @@ def FeatureISAVersion11_Common : FeatureSet< FeatureDot10Insts, FeatureNSAEncoding, FeaturePartialNSAEncoding, - FeatureWavefrontSize32, FeatureShaderCyclesRegister, FeatureArchitectedFlatScratch, FeatureAtomicFaddRtnInsts, @@ -1625,7 +1623,6 @@ def FeatureISAVersion12 : FeatureSet< FeatureDot11Insts, FeatureNSAEncoding, FeaturePartialNSAEncoding, - FeatureWavefrontSize32, FeatureShaderCyclesHiLoRegisters, FeatureArchitectedFlatScratch, FeatureArchitectedSGPRs, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 15df6216f89a4d..a41df9606749fd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -178,6 +178,20 @@ bool AMDGPUAtomicOptimizerImpl::run(Function &F) { return Changed; } +static bool isLegalCrossLaneType(Type *Ty) { + switch (Ty->getTypeID()) { + case Type::FloatTyID: + case Type::DoubleTyID: + return true; + case Type::IntegerTyID: { + unsigned Size = Ty->getIntegerBitWidth(); + return (Size == 32 || Size == 64); + } + default: + return false; + } +} + void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) { // Early exit for unhandled address space atomic instructions. switch (I.getPointerAddressSpace()) { @@ -228,11 +242,14 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) { // If the value operand is divergent, each lane is contributing a different // value to the atomic calculation. We can only optimize divergent values if - // we have DPP available on our subtarget, and the atomic operation is 32 - // bits. - if (ValDivergent && - (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) { - return; + // we have DPP available on our subtarget (for DPP strategy), and the atomic + // operation is 32 or 64 bits. + if (ValDivergent) { + if (ScanImpl == ScanOptions::DPP && !ST->hasDPP()) + return; + + if (!isLegalCrossLaneType(I.getType())) + return; } // If we get here, we can optimize the atomic using a single wavefront-wide @@ -311,11 +328,14 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) { // If the value operand is divergent, each lane is contributing a different // value to the atomic calculation. We can only optimize divergent values if - // we have DPP available on our subtarget, and the atomic operation is 32 - // bits. - if (ValDivergent && - (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) { - return; + // we have DPP available on our subtarget (for DPP strategy), and the atomic + // operation is 32 or 64 bits. + if (ValDivergent) { + if (ScanImpl == ScanOptions::DPP && !ST->hasDPP()) + return; + + if (!isLegalCrossLaneType(I.getType())) + return; } // If any of the other arguments to the intrinsic are divergent, we can't @@ -748,7 +768,6 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, // of each active lane in the wavefront. This will be our new value // which we will provide to the atomic operation. Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); - assert(TyBitWidth == 32); NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane, {NewV, LastLaneIdx}); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 43bfd0f13f875a..9d3c9e1e2ef9f3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -24,7 +24,7 @@ namespace llvm { void initializeCycleInfoWrapperPassPass(PassRegistry &); -} +} // namespace llvm using namespace llvm; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index b113904ce242f9..3e1d1283dd485e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -260,7 +260,7 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler { assignValueToAddress(ValVReg, Addr, MemTy, MPO, VA); } }; -} +} // anonymous namespace AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI) : CallLowering(&TLI) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 1aaf514ae8f62c..26116bfa3c2feb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -47,8 +47,7 @@ static cl::opt VerifyHSAMetadata( "amdgpu-verify-hsa-metadata", cl::desc("Verify AMDGPU HSA Metadata")); -namespace AMDGPU { -namespace HSAMD { +namespace AMDGPU::HSAMD { //===----------------------------------------------------------------------===// // HSAMetadataStreamerV4 @@ -707,6 +706,5 @@ void MetadataStreamerMsgPackV6::emitVersion() { getRootMetadata("amdhsa.version") = Version; } -} // end namespace HSAMD -} // end namespace AMDGPU +} // end namespace AMDGPU::HSAMD } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index 86f28a5057694d..74e67690d5e88b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -112,7 +112,7 @@ class InstructionRule { virtual ~InstructionRule() = default; }; -typedef DenseMap> SUnitsToCandidateSGsMap; +using SUnitsToCandidateSGsMap = DenseMap>; // Classify instructions into groups to enable fine tuned control over the // scheduler. These groups may be more specific than current SchedModel @@ -261,8 +261,8 @@ static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) { S.getSUnit()->removePred(SP); } -typedef std::pair> SUToCandSGsPair; -typedef SmallVector SUsToCandSGsVec; +using SUToCandSGsPair = std::pair>; +using SUsToCandSGsVec = SmallVector; // The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline // in non-trivial cases. For example, if the requested pipeline is @@ -311,7 +311,7 @@ class PipelineSolver { uint64_t BranchesExplored = 0; // The direction in which we process the candidate SchedGroups per SU - bool IsBottomUp = 1; + bool IsBottomUp = true; // Update indices to fit next conflicting instruction void advancePosition(); @@ -365,7 +365,7 @@ class PipelineSolver { PipelineSolver(DenseMap> &SyncedSchedGroups, DenseMap &SyncedInstrs, - ScheduleDAGMI *DAG, bool IsBottomUp = 1) + ScheduleDAGMI *DAG, bool IsBottomUp = true) : DAG(DAG), SyncedInstrs(SyncedInstrs), SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) { @@ -858,7 +858,7 @@ class IGLPStrategy { virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, AMDGPU::SchedulingPhase Phase) = 0; - bool IsBottomUp = 1; + bool IsBottomUp = true; IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) : DAG(DAG), TII(TII) {} @@ -881,7 +881,7 @@ class MFMASmallGemmOpt final : public IGLPStrategy { MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) : IGLPStrategy(DAG, TII) { - IsBottomUp = 1; + IsBottomUp = true; } }; @@ -1350,7 +1350,7 @@ class MFMAExpInterleaveOpt final : public IGLPStrategy { MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) : IGLPStrategy(DAG, TII) { - IsBottomUp = 0; + IsBottomUp = false; } }; @@ -2061,7 +2061,7 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) : IGLPStrategy(DAG, TII) { - IsBottomUp = 0; + IsBottomUp = false; } }; @@ -2371,7 +2371,7 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation { // created SchedGroup first, and will consider that as the ultimate // predecessor group when linking. TOP_DOWN instead links and processes the // first created SchedGroup first. - bool IsBottomUp = 1; + bool IsBottomUp = true; // The scheduling phase this application of IGLP corresponds with. AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 26426575aeed30..d4b87d85a7c20c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5839,7 +5839,7 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( if (Tmp0 == 1) return 1; // Early out. - return std::min(Tmp0, std::min(Tmp1, Tmp2)); + return std::min({Tmp0, Tmp1, Tmp2}); } default: return 1; @@ -5876,7 +5876,7 @@ unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr( unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1); if (Tmp0 == 1) return 1; - return std::min(Tmp0, std::min(Tmp1, Tmp2)); + return std::min({Tmp0, Tmp1, Tmp2}); } default: return 1; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index 217da6d2aa0ab6..2cc95f81d2f94d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -365,25 +365,13 @@ bool LiveRegOptimizer::optimizeLiveType( else MissingIncVal = true; } - + Instruction *DeadInst = Phi; if (MissingIncVal) { - Value *DeadVal = ValMap[Phi]; - // The coercion chain of the PHI is broken. Delete the Phi - // from the ValMap and any connected / user Phis. - SmallVector PHIWorklist; - PHIWorklist.push_back(DeadVal); - while (!PHIWorklist.empty()) { - Value *NextDeadValue = PHIWorklist.pop_back_val(); - ValMap.erase(NextDeadValue); - DeadInsts.emplace_back(cast(NextDeadValue)); - - for (User *U : NextDeadValue->users()) { - if (ValMap.contains(cast(U))) - PHIWorklist.push_back(U); - } - } - } else - DeadInsts.emplace_back(cast(Phi)); + DeadInst = cast(ValMap[Phi]); + // Do not use the dead phi + ValMap[Phi] = Phi; + } + DeadInsts.emplace_back(DeadInst); } // Coerce back to the original type and replace the uses. for (Instruction *U : Uses) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index 456f3cb332cf83..30c5e5eebfcdc8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -55,7 +55,7 @@ class AMDGPULibCalls { AssumptionCache *AC = nullptr; DominatorTree *DT = nullptr; - typedef llvm::AMDGPULibFunc FuncInfo; + using FuncInfo = llvm::AMDGPULibFunc; bool UnsafeFPMath = false; @@ -136,7 +136,7 @@ class AMDGPULibCalls { } public: - AMDGPULibCalls() {} + AMDGPULibCalls() = default; bool fold(CallInst *CI); @@ -147,7 +147,7 @@ class AMDGPULibCalls { bool useNative(CallInst *CI); }; -} // end llvm namespace +} // end namespace llvm template static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg, @@ -899,7 +899,7 @@ static double log2(double V) { return log(V) / numbers::ln2; #endif } -} +} // namespace llvm bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp index 3437b6dc8ae0ca..dbc9233b72def4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp @@ -367,11 +367,11 @@ static AMDGPULibFunc::Param getRetType(AMDGPULibFunc::EFuncId id, class ParamIterator { const AMDGPULibFunc::Param (&Leads)[2]; const ManglingRule& Rule; - int Index; + int Index = 0; public: ParamIterator(const AMDGPULibFunc::Param (&leads)[2], const ManglingRule& rule) - : Leads(leads), Rule(rule), Index(0) {} + : Leads(leads), Rule(rule) {} AMDGPULibFunc::Param getNextParam(); }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 470180f2bcd281..6be9be21a8a861 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -700,7 +700,7 @@ class SplitPtrStructs : public InstVisitor { // Subtarget info, needed for determining what cache control bits to set. const TargetMachine *TM; - const GCNSubtarget *ST; + const GCNSubtarget *ST = nullptr; IRBuilder<> IRB; @@ -740,7 +740,7 @@ class SplitPtrStructs : public InstVisitor { public: SplitPtrStructs(LLVMContext &Ctx, const TargetMachine *TM) - : TM(TM), ST(nullptr), IRB(Ctx) {} + : TM(TM), IRB(Ctx) {} void processFunction(Function &F); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 2bdbf4151dd954..a295117de6414d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -281,7 +281,7 @@ class AMDGPULowerModuleLDS { // immediately used by the kernel must still be allocated by it. An // equivalent target specific intrinsic which lasts until immediately after // codegen would suffice for that, but one would still need to ensure that - // the variables are allocated in the anticpated order. + // the variables are allocated in the anticipated order. BasicBlock *Entry = &Func->getEntryBlock(); IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt()); @@ -545,7 +545,7 @@ class AMDGPULowerModuleLDS { static std::vector assignLDSKernelIDToEachKernel( Module *M, DenseSet const &KernelsThatAllocateTableLDS, DenseSet const &KernelsThatIndirectlyAllocateDynamicLDS) { - // Associate kernels in the set with an arbirary but reproducible order and + // Associate kernels in the set with an arbitrary but reproducible order and // annotate them with that order in metadata. This metadata is recognised by // the backend and lowered to a SGPR which can be read from using // amdgcn_lds_kernel_id. @@ -1087,7 +1087,7 @@ class AMDGPULowerModuleLDS { raw_string_ostream SS{Buffer}; SS << format("%u", Offset); - // Instead of explictly marking kernels that access dynamic variables + // Instead of explicitly marking kernels that access dynamic variables // using special case metadata, annotate with min-lds == max-lds, i.e. // that there is no more space available for allocating more static // LDS variables. That is the right condition to prevent allocating @@ -1173,7 +1173,7 @@ class AMDGPULowerModuleLDS { LayoutFields.reserve(LDSVarsToTransform.size()); { // The order of fields in this struct depends on the order of - // varables in the argument which varies when changing how they + // variables in the argument which varies when changing how they // are identified, leading to spurious test breakage. auto Sorted = sortByName(std::vector( LDSVarsToTransform.begin(), LDSVarsToTransform.end())); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 31777295b4f8fe..99b4fca20bb2da 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -44,8 +44,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F, : IsEntryFunction(AMDGPU::isEntryFunctionCC(F.getCallingConv())), IsModuleEntryFunction( AMDGPU::isModuleEntryFunctionCC(F.getCallingConv())), - IsChainFunction(AMDGPU::isChainCC(F.getCallingConv())), - NoSignedZerosFPMath(false) { + IsChainFunction(AMDGPU::isChainCC(F.getCallingConv())) { // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset, // except reserved size is not correctly aligned. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp index a9f1e9bd099635..1213d5e0b41db1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -68,7 +68,7 @@ struct AMDGPUPerfHint { public: AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_, const TargetLowering *TLI_) - : FIM(FIM_), DL(nullptr), TLI(TLI_) {} + : FIM(FIM_), TLI(TLI_) {} bool runOnFunction(Function &F); @@ -95,7 +95,7 @@ struct AMDGPUPerfHint { AMDGPUPerfHintAnalysis::FuncInfoMap &FIM; - const DataLayout *DL; + const DataLayout *DL = nullptr; const TargetLowering *TLI; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp index c97a40976bd2be..42a6bac4fa6f24 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -34,7 +34,7 @@ using namespace llvm; #define DEBUG_TYPE "printfToRuntime" -#define DWORD_ALIGN 4 +enum { DWORD_ALIGN = 4 }; namespace { class AMDGPUPrintfRuntimeBinding final : public ModulePass { @@ -50,7 +50,7 @@ class AMDGPUPrintfRuntimeBinding final : public ModulePass { class AMDGPUPrintfRuntimeBindingImpl { public: - AMDGPUPrintfRuntimeBindingImpl() {} + AMDGPUPrintfRuntimeBindingImpl() = default; bool run(Module &M); private: diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 9e7694f41d6b8f..17413ab55536da 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -111,7 +111,7 @@ class ApplyRegBankMapping final : public GISelChangeObserver { B.setChangeObserver(*this); } - ~ApplyRegBankMapping() { + ~ApplyRegBankMapping() override { for (MachineInstr *MI : NewInsts) applyBank(*MI); @@ -199,7 +199,7 @@ class ApplyRegBankMapping final : public GISelChangeObserver { } }; -} +} // anonymous namespace AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST) : Subtarget(ST), TRI(Subtarget.getRegisterInfo()), diff --git a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp index 9d44b65d1698ca..e72cfcb5bb038f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp @@ -28,7 +28,7 @@ using namespace llvm; namespace llvm { extern const SubtargetFeatureKV AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures - 1]; -} +} // namespace llvm namespace { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 2fe9cd242ff19b..3bf72d1a5d40aa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -65,9 +65,10 @@ static const Function *getCalleeFunction(const MachineOperand &Op) { assert(Op.getImm() == 0); return nullptr; } - if (auto *GA = dyn_cast(Op.getGlobal())) - return cast(GA->getOperand(0)); - return cast(Op.getGlobal()); + const GlobalValue *GV = Op.getGlobal(); + while (auto *GA = dyn_cast(GV)) + GV = cast(GA->getOperand(0)); + return cast(GV); } static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp index 3e5d83b8e3fb10..f75961f6eaa775 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -256,11 +256,12 @@ calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M, } CostType FnCost = (ModuleCost - KernelCost); + CostType ModuleCostOr1 = ModuleCost ? ModuleCost : 1; SML << "=> Total Module Cost: " << ModuleCost << '\n' << " => KernelCost: " << KernelCost << " (" - << format("%0.2f", (float(KernelCost) / ModuleCost) * 100) << "%)\n" + << format("%0.2f", (float(KernelCost) / ModuleCostOr1) * 100) << "%)\n" << " => FnsCost: " << FnCost << " (" - << format("%0.2f", (float(FnCost) / ModuleCost) * 100) << "%)\n"; + << format("%0.2f", (float(FnCost) / ModuleCostOr1) * 100) << "%)\n"; return ModuleCost; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 21fe1bc31a27e4..49af0025afa9c9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -105,6 +105,14 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, : AMDGPUSubtarget::SOUTHERN_ISLANDS; } + if (!hasFeature(AMDGPU::FeatureWavefrontSize32) && + !hasFeature(AMDGPU::FeatureWavefrontSize64)) { + // If there is no default wave size it must be a generation before gfx10, + // these have FeatureWavefrontSize64 in their definition already. For gfx10+ + // set wave32 as a default. + ToggleFeature(AMDGPU::FeatureWavefrontSize32); + } + // We don't support FP64 for EG/NI atm. assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)); @@ -175,7 +183,7 @@ void GCNSubtarget::checkSubtargetFeatures(const Function &F) const { } } -AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {} +AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {} bool AMDGPUSubtarget::useRealTrue16Insts() const { return hasTrue16BitInsts() && EnableRealTrue16Insts; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index e2d8b5d1ce9790..49ccd2c9ae5112 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -71,7 +71,7 @@ class AMDGPUSubtarget { char WavefrontSizeLog2 = 0; public: - AMDGPUSubtarget(const Triple &TT); + AMDGPUSubtarget(Triple TT); static const AMDGPUSubtarget &get(const MachineFunction &MF); static const AMDGPUSubtarget &get(const TargetMachine &TM, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 9ddf0a310ed061..192996f84c5f37 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -177,7 +177,7 @@ static VGPRRegisterRegAlloc greedyRegAllocVGPR( static VGPRRegisterRegAlloc fastRegAllocVGPR( "fast", "fast register allocator", createFastVGPRRegisterAllocator); -} +} // anonymous namespace static cl::opt EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index b08957d22ee74e..1d43043308ed96 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -947,7 +947,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isEndpgm() const; auto getPredicate(std::function P) const { - return std::bind(P, *this); + return [=](){ return P(*this); }; } StringRef getToken() const { @@ -1408,6 +1408,15 @@ class AMDGPUAsmParser : public MCTargetAsmParser { copySTI().ToggleFeature("southern-islands"); } + FeatureBitset FB = getFeatureBits(); + if (!FB[AMDGPU::FeatureWavefrontSize64] && + !FB[AMDGPU::FeatureWavefrontSize32]) { + // If there is no default wave size it must be a generation before gfx10, + // these have FeatureWavefrontSize64 in their definition already. For + // gfx10+ set wave32 as a default. + copySTI().ToggleFeature(AMDGPU::FeatureWavefrontSize32); + } + setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits())); AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 695b2f246a778d..3e7b6ab19dd0c6 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -45,10 +45,26 @@ using namespace llvm; using DecodeStatus = llvm::MCDisassembler::DecodeStatus; +static const MCSubtargetInfo &addDefaultWaveSize(const MCSubtargetInfo &STI, + MCContext &Ctx) { + if (!STI.hasFeature(AMDGPU::FeatureWavefrontSize64) && + !STI.hasFeature(AMDGPU::FeatureWavefrontSize32)) { + MCSubtargetInfo &STICopy = Ctx.getSubtargetCopy(STI); + // If there is no default wave size it must be a generation before gfx10, + // these have FeatureWavefrontSize64 in their definition already. For gfx10+ + // set wave32 as a default. + STICopy.ToggleFeature(AMDGPU::FeatureWavefrontSize32); + return STICopy; + } + + return STI; +} + AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, MCInstrInfo const *MCII) - : MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()), - MAI(*Ctx.getAsmInfo()), TargetMaxInstBytes(MAI.getMaxInstLength(&STI)), + : MCDisassembler(addDefaultWaveSize(STI, Ctx), Ctx), MCII(MCII), + MRI(*Ctx.getRegisterInfo()), MAI(*Ctx.getAsmInfo()), + TargetMaxInstBytes(MAI.getMaxInstLength(&STI)), CodeObjectVersion(AMDGPU::getDefaultAMDHSACodeObjectVersion()) { // ToDo: AMDGPUDisassembler supports only VI ISA. if (!STI.hasFeature(AMDGPU::FeatureGCN3Encoding) && !isGFX10Plus()) diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp index 1dda1b89b2d36c..7a25a90d8bb053 100644 --- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp +++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp @@ -43,7 +43,7 @@ class GCNCreateVOPD : public MachineFunctionPass { private: class VOPDCombineInfo { public: - VOPDCombineInfo() {} + VOPDCombineInfo() = default; VOPDCombineInfo(MachineInstr *First, MachineInstr *Second) : FirstMI(First), SecondMI(Second) {} diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 4fd9ed2a89279a..a402fc6d7e6110 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -446,10 +446,10 @@ void GCNHazardRecognizer::RecedeCycle() { // Helper Functions //===----------------------------------------------------------------------===// -typedef enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult; +using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound }; -typedef function_ref IsExpiredFn; -typedef function_ref GetNumWaitStatesFn; +using IsExpiredFn = function_ref; +using GetNumWaitStatesFn = function_ref; // Search for a hazard in a block and its predecessors. template diff --git a/llvm/lib/Target/AMDGPU/GCNILPSched.cpp b/llvm/lib/Target/AMDGPU/GCNILPSched.cpp index 559dd0ed0c4134..5926abca12449b 100644 --- a/llvm/lib/Target/AMDGPU/GCNILPSched.cpp +++ b/llvm/lib/Target/AMDGPU/GCNILPSched.cpp @@ -27,7 +27,7 @@ class GCNILPScheduler { }; SpecificBumpPtrAllocator Alloc; - typedef simple_ilist Queue; + using Queue = simple_ilist; Queue PendingQueue; Queue AvailQueue; unsigned CurQueueId = 0; @@ -359,4 +359,4 @@ std::vector makeGCNILPScheduler(ArrayRef BotRoots, GCNILPScheduler S; return S.schedule(BotRoots, DAG); } -} +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index aebfe154b31395..061b0515031b1b 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -24,9 +24,9 @@ namespace llvm { std::vector makeMinRegSchedule(ArrayRef TopRoots, const ScheduleDAG &DAG); - std::vector makeGCNILPScheduler(ArrayRef BotRoots, - const ScheduleDAG &DAG); -} +std::vector makeGCNILPScheduler(ArrayRef BotRoots, + const ScheduleDAG &DAG); +} // namespace llvm // shim accessors for different order containers static inline MachineInstr *getMachineInstr(MachineInstr *MI) { @@ -383,7 +383,7 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule, // Schedule consisting of MachineInstr* is considered 'detached' // and already interleaved with debug values - if (!std::is_same::value) { + if (!std::is_same_v) { placeDebugValues(); // Unfortunately placeDebugValues incorrectly modifies RegionEnd, restore // assert(R.End == RegionEnd); diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index 90dbbf407d3dd8..d6395fd75924de 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -56,13 +56,13 @@ class GCNNSAReassign : public MachineFunctionPass { } private: - typedef enum { + using NSA_Status = enum { NOT_NSA, // Not an NSA instruction FIXED, // NSA which we cannot modify NON_CONTIGUOUS, // NSA with non-sequential address which we can try // to optimize. CONTIGUOUS // NSA with all sequential address registers - } NSA_Status; + }; const GCNSubtarget *ST; diff --git a/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp b/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp index 6f83804f0a6f4d..90169b1cb3df9b 100644 --- a/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp @@ -88,7 +88,7 @@ class GCNRewritePartialRegUses : public MachineFunctionPass { }; /// Map OldSubReg -> { RC, NewSubReg }. Used as in/out container. - typedef SmallDenseMap SubRegMap; + using SubRegMap = SmallDenseMap; /// Given register class RC and the set of used subregs as keys in the SubRegs /// map return new register class and indexes of right-shifted subregs as diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp index c8ce1903d31537..ed9c48ff9c4de4 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp @@ -13,13 +13,12 @@ #include "AMDGPUCustomBehaviour.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "Utils/AMDGPUBaseInfo.h" #include "TargetInfo/AMDGPUTargetInfo.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/WithColor.h" -namespace llvm { -namespace mca { +namespace llvm::mca { void AMDGPUInstrPostProcess::postProcessInstruction( std::unique_ptr &Inst, const MCInst &MCI) { @@ -332,8 +331,7 @@ bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const { return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode); } -} // namespace mca -} // namespace llvm +} // namespace llvm::mca using namespace llvm; using namespace mca; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp index 4e9a33227a5dcb..709cd15d9b9e4a 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp @@ -26,7 +26,7 @@ class AMDGPUELFStreamer : public MCELFStreamer { std::move(Emitter)) {} }; -} +} // anonymous namespace MCELFStreamer * llvm::createAMDGPUELFStreamer(const Triple &T, MCContext &Context, diff --git a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp index c8152c1f920df4..3aa8dd8c521629 100644 --- a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp @@ -24,7 +24,7 @@ using namespace llvm; #define DEBUG_TYPE "structcfg" -#define DEFAULT_VEC_SLOTS 8 +enum { DEFAULT_VEC_SLOTS = 8 }; // TODO: move-begin. diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 68c5f23c8e11f3..d43100254bfc90 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -124,7 +124,7 @@ class SIFixSGPRCopies : public MachineFunctionPass { SmallVector RegSequences; SmallVector PHINodes; SmallVector S2VCopies; - unsigned NextVGPRToSGPRCopyID; + unsigned NextVGPRToSGPRCopyID = 0; MapVector V2SCopies; DenseMap> SiblingPenalty; @@ -135,7 +135,7 @@ class SIFixSGPRCopies : public MachineFunctionPass { const SIRegisterInfo *TRI; const SIInstrInfo *TII; - SIFixSGPRCopies() : MachineFunctionPass(ID), NextVGPRToSGPRCopyID(0) {} + SIFixSGPRCopies() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; void fixSCCCopies(MachineFunction &MF); diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index 70ea5b8916155a..0d3a221970bf85 100644 --- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -32,7 +32,7 @@ MaxClause("amdgpu-max-memory-clause", cl::Hidden, cl::init(15), namespace { class SIFormMemoryClauses : public MachineFunctionPass { - typedef DenseMap> RegUse; + using RegUse = DenseMap>; public: static char ID; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a733295d2a511e..bb8e21772e5662 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4814,14 +4814,14 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, .addReg(PhiReg) .add(*Val) .addReg(SGPRIdxReg) - .addImm(AMDGPU::sub0); + .addImm(SubReg); } else { const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo( TRI.getRegSizeInBits(*VecRC), 32, false); BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst) .addReg(PhiReg) .add(*Val) - .addImm(AMDGPU::sub0); + .addImm(SubReg); } MI.eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 00eb4d60521ea9..a18da72b02ebed 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -451,7 +451,7 @@ class WaitcntGenerator { bool OptNone; public: - WaitcntGenerator() {} + WaitcntGenerator() = default; WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter) : ST(&MF.getSubtarget()), TII(ST->getInstrInfo()), IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter), @@ -510,7 +510,7 @@ class WaitcntGenerator { class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { public: - WaitcntGeneratorPreGFX12() {} + WaitcntGeneratorPreGFX12() = default; WaitcntGeneratorPreGFX12(const MachineFunction &MF) : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {} @@ -540,12 +540,12 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { return WaitEventMaskForInstPreGFX12; } - virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; + AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; }; class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { public: - WaitcntGeneratorGFX12Plus() {} + WaitcntGeneratorGFX12Plus() = default; WaitcntGeneratorGFX12Plus(const MachineFunction &MF, InstCounterType MaxCounter) : WaitcntGenerator(MF, MaxCounter) {} @@ -575,7 +575,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { return WaitEventMaskForInstGFX12Plus; } - virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; + AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; }; class SIInsertWaitcnts : public MachineFunctionPass { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index cc1b9ac0c9ecda..ba72152f5668e2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -40,15 +40,12 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AMDGPUGenInstrInfo.inc" -namespace llvm { -namespace AMDGPU { +namespace llvm::AMDGPU { #define GET_D16ImageDimIntrinsics_IMPL #define GET_ImageDimIntrinsicTable_IMPL #define GET_RsrcIntrinsics_IMPL #include "AMDGPUGenSearchableTables.inc" -} -} - +} // namespace llvm::AMDGPU // Must be at least 4 to be able to branch over minimum unconditional branch // code. This is only for making it possible to write reasonably small tests for diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index d9db0f7a4f531e..bf8e61b554faec 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -26,7 +26,7 @@ #include #include -#define MAX_LANES 64 +enum { MAX_LANES = 64 }; using namespace llvm; diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index 4476adf95f8d37..6550f98018aa44 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -12,8 +12,8 @@ //===----------------------------------------------------------------------===// #include "SIMachineScheduler.h" -#include "SIInstrInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -135,8 +135,7 @@ static const char *getReasonStr(SIScheduleCandReason Reason) { #endif -namespace llvm { -namespace SISched { +namespace llvm::SISched { static bool tryLess(int TryVal, int CandVal, SISchedulerCandidate &TryCand, SISchedulerCandidate &Cand, @@ -170,8 +169,7 @@ static bool tryGreater(int TryVal, int CandVal, Cand.setRepeat(Reason); return false; } -} // end namespace SISched -} // end namespace llvm +} // end namespace llvm::SISched // SIScheduleBlock // diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 24f8788683ed7f..452dac4b009932 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -609,6 +609,9 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { bool insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const; + bool setAtomicScope(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const; + public: SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {} @@ -625,6 +628,28 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { bool IsLastUse) const override; bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override; + + bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, + Position Pos) const override; + + bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override { + return setAtomicScope(MI, Scope, AddrSpace); + } + + bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override { + return setAtomicScope(MI, Scope, AddrSpace); + } + + bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override { + return setAtomicScope(MI, Scope, AddrSpace); + } }; class SIMemoryLegalizer final : public MachineFunctionPass { @@ -723,7 +748,7 @@ static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI, return (Result != SIAtomicAddrSpace::NONE) ? Result : Default; } -} // end namespace anonymous +} // end anonymous namespace void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, const char *Msg) const { @@ -2429,6 +2454,72 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return true; } +bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + // The scratch address space does not need the global memory cache + // writeback as all memory operations by the same thread are + // sequentially consistent, and no other thread can access scratch + // memory. + + // Other address spaces do not have a cache. + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) + return false; + + if (Pos == Position::AFTER) + ++MI; + + // GLOBAL_WB is always needed, even for write-through caches, as it + // additionally ensures all operations have reached the desired cache level. + bool SkipWB = false; + AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; + switch (Scope) { + case SIAtomicScope::SYSTEM: + ScopeImm = AMDGPU::CPol::SCOPE_SYS; + break; + case SIAtomicScope::AGENT: + ScopeImm = AMDGPU::CPol::SCOPE_DEV; + break; + case SIAtomicScope::WORKGROUP: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore we need to ensure all operations have reached L1, + // hence the SCOPE_SE WB. + // For CU mode, we need operations to reach L0, so the wait is enough - + // there are no ways for an operation to report completion without reaching + // at least L0. + if (ST.isCuModeEnabled()) + SkipWB = true; + else + ScopeImm = AMDGPU::CPol::SCOPE_SE; + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to invalidate. + return false; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + + if (!SkipWB) + BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(ScopeImm); + + if (Pos == Position::AFTER) + --MI; + + // We always have to wait for previous memory operations (load/store) to + // complete, whether we inserted a WB or not. If we inserted a WB (storecnt), + // we of course need to wait for that as well. + insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, + IsCrossAddrSpaceOrdering, Pos); + + return true; +} + bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { @@ -2479,6 +2570,44 @@ bool SIGfx12CacheControl::expandSystemScopeStore( return false; } +bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + bool Changed = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); + break; + case SIAtomicScope::AGENT: + Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV); + break; + case SIAtomicScope::WORKGROUP: + // In workgroup mode, SCOPE_SE is needed as waves can executes on + // different CUs that access different L0s. + if (!ST.isCuModeEnabled()) + Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE); + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to bypass. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + // The scratch address space does not need the global memory caches + // to be bypassed as all memory operations by the same thread are + // sequentially consistent, and no other thread can access scratch + // memory. + + // Other address spaces do not have a cache. + + return Changed; +} + bool SIMemoryLegalizer::removeAtomicPseudoMIs() { if (AtomicPseudoMIs.empty()) return false; diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index ae91cb31590cfd..19e761ef45b25b 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -550,8 +550,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, Opcode == AMDGPU::DS_DIRECT_LOAD) { // Mark these STRICTWQM, but only for the instruction, not its operands. // This avoid unnecessarily marking M0 as requiring WQM. - InstrInfo &II = Instructions[&MI]; - II.Needs |= StateStrictWQM; + III.Needs |= StateStrictWQM; GlobalFlags |= StateStrictWQM; } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 || Opcode == AMDGPU::V_SET_INACTIVE_B64) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 3af536dac473e1..5f7549c2921eda 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -9,8 +9,7 @@ #include "AMDGPUBaseInfo.h" #include "SIDefines.h" -namespace llvm { -namespace AMDGPU { +namespace llvm::AMDGPU { //===----------------------------------------------------------------------===// // Custom Operands. @@ -684,5 +683,4 @@ ArrayRef getGFXVersions() { } // namespace UCVersion -} // namespace AMDGPU -} // namespace llvm +} // namespace llvm::AMDGPU diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 1b3cc4a83bea3b..bb5f2328129f91 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -153,7 +153,7 @@ inline unsigned getSaSdstBitWidth() { return 1; } /// \returns SaSdst bit shift inline unsigned getSaSdstBitShift() { return 0; } -} // end namespace anonymous +} // end anonymous namespace namespace llvm { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp index abe0ce375aedd4..4cda8b2813709c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp @@ -25,9 +25,7 @@ using namespace llvm; -namespace llvm { - -namespace AMDGPU { +namespace llvm::AMDGPU { Align getAlign(const DataLayout &DL, const GlobalVariable *GV) { return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL), @@ -371,6 +369,4 @@ bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA, return false; } -} // end namespace AMDGPU - -} // end namespace llvm +} // end namespace llvm::AMDGPU diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp index 61d2928fe6d412..1cc21da51d1e67 100644 --- a/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -40,7 +40,7 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetOpcodes.h" diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 2683b5741d4597..75d16a42d0205a 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -53,7 +53,7 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" #include "llvm/CodeGen/SelectionDAGNodes.h" diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.h b/llvm/lib/Target/ARM/ARMLegalizerInfo.h index d6ce4eb1055b48..9e10638233e6d1 100644 --- a/llvm/lib/Target/ARM/ARMLegalizerInfo.h +++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.h @@ -16,7 +16,7 @@ #include "llvm/ADT/IndexedMap.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/IR/Instructions.h" namespace llvm { diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h index ffa8b50493510d..275b1c0f8dc017 100644 --- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h +++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h @@ -14,7 +14,7 @@ #define LLVM_LIB_TARGET_ARM_ARMSELECTIONDAGINFO_H #include "MCTargetDesc/ARMAddressingModes.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" namespace llvm { diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 09fe5264e09553..79cffc0da7a4f2 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -27,7 +27,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetCallingConv.h" #include "llvm/CodeGen/ValueTypes.h" @@ -39,12 +39,12 @@ #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsHexagon.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" diff --git a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp index 8aef45e401d43b..06fd7ac807d8ab 100644 --- a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp +++ b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp @@ -27,7 +27,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 5c4fddd5116eaa..79da36c03e3046 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -22,7 +22,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/ISDOpcodes.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicsLoongArch.h" diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index 7d8adb9a003114..ef70ef27726814 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -40,7 +40,7 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetFrameLowering.h" diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 380d878c1f532e..a004d64c21cc69 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -227,9 +227,33 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, if (Modifier) { const MCOperand &MO = MI->getOperand(OpNum); int Imm = (int) MO.getImm(); - if (!strcmp(Modifier, "volatile")) { - if (Imm) + if (!strcmp(Modifier, "sem")) { + switch (Imm) { + case NVPTX::PTXLdStInstCode::NotAtomic: + break; + case NVPTX::PTXLdStInstCode::Volatile: O << ".volatile"; + break; + case NVPTX::PTXLdStInstCode::Relaxed: + O << ".relaxed.sys"; + break; + case NVPTX::PTXLdStInstCode::Acquire: + O << ".acquire.sys"; + break; + case NVPTX::PTXLdStInstCode::Release: + O << ".release.sys"; + break; + case NVPTX::PTXLdStInstCode::RelaxedMMIO: + O << ".mmio.relaxed.sys"; + break; + default: + SmallString<256> Msg; + raw_svector_ostream OS(Msg); + OS << "NVPTX LdStCode Printer does not support \"" << Imm + << "\" sem modifier."; + report_fatal_error(OS.str()); + break; + } } else if (!strcmp(Modifier, "addsp")) { switch (Imm) { case NVPTX::PTXLdStInstCode::GLOBAL: diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index b0cb24c63c3ceb..3c7167b1570254 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -107,6 +107,14 @@ enum LoadStore { }; namespace PTXLdStInstCode { +enum MemorySemantic { + NotAtomic = 0, // PTX calls these: "Weak" + Volatile = 1, + Relaxed = 2, + Acquire = 3, + Release = 4, + RelaxedMMIO = 5 +}; enum AddressSpace { GENERIC = 0, GLOBAL = 1, diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 11193c11ede3b1..371ec8596ef637 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -714,6 +714,170 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) { return NVPTX::PTXLdStInstCode::GENERIC; } +static unsigned int getCodeMemorySemantic(MemSDNode *N, + const NVPTXSubtarget *Subtarget) { + AtomicOrdering Ordering = N->getSuccessOrdering(); + auto CodeAddrSpace = getCodeAddrSpace(N); + + bool HasMemoryOrdering = Subtarget->hasMemoryOrdering(); + bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO(); + + // TODO: lowering for SequentiallyConsistent Operations: for now, we error. + // TODO: lowering for AcquireRelease Operations: for now, we error. + // + + // clang-format off + + // Lowering for non-SequentiallyConsistent Operations + // + // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ | + // |---------|----------|--------------------|------------|------------------------------| + // | No | No | All | plain | .weak | + // | No | Yes | Generic,Shared, | .volatile | .volatile | + // | | | Global [0] | | | + // | No | Yes | Local,Const,Param | plain [1] | .weak [1] | + // | Relaxed | No | Generic,Shared, | | | + // | | | Global [0] | .volatile | | + // | Other | No | Generic,Shared, | Error [2] | | + // | | | Global [0] | | | + // | Yes | No | Local,Const,Param | plain [1] | .weak [1] | + // | Relaxed | Yes | Generic,Shared [0] | .volatile | .volatile | + // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) | + // | | | | | or .volatile (PTX 8.1-) | + // | Relaxed | Yes | Local,Const,Param | plain [1] | .weak [1] | + // | Other | Yes | Generic, Shared, | Error [2] | [3] | + // | | | / Global [0] | | | + + // clang-format on + + // [0]: volatile and atomics are only supported on global or shared + // memory locations, accessed via generic/shared/global pointers. + // MMIO is only supported on global memory locations, + // accessed via generic/global pointers. + // TODO: Implement MMIO access via generic pointer to global. + // Currently implemented for global pointers only. + + // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic + // PTX instructions fails to preserve their C++ side-effects. + // + // Example (https://github.com/llvm/llvm-project/issues/62057): + // + // void example() { + // std::atomic True = true; + // while (True.load(std::memory_order_relaxed)); + // } + // + // A C++ program that calls "example" is well-defined: the infinite loop + // performs an atomic operation. By lowering volatile/atomics to + // "weak" memory operations, we are transforming the above into: + // + // void undefined_behavior() { + // bool True = true; + // while (True); + // } + // + // which exhibits undefined behavior in both C++ and PTX. + // + // Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined + // behavior due to lack of Independent Forward Progress. Lowering these + // to weak memory operations in sm_60- is therefore fine. + // + // TODO: lower atomic and volatile operations to memory locations + // in local, const, and param to two PTX instructions in sm_70+: + // - the "weak" memory instruction we are currently lowering to, and + // - some other instruction that preserves the side-effect, e.g., + // a dead dummy volatile load. + + if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL || + CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT || + CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) { + return NVPTX::PTXLdStInstCode::NotAtomic; + } + + // [2]: Atomics with Ordering different than Relaxed are not supported on + // sm_60 and older; this includes volatile atomics. + if (!(Ordering == AtomicOrdering::NotAtomic || + Ordering == AtomicOrdering::Monotonic) && + !HasMemoryOrdering) { + SmallString<256> Msg; + raw_svector_ostream OS(Msg); + OS << "PTX does not support \"atomic\" for orderings different than" + "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order is: \"" + << toIRString(Ordering) << "\"."; + report_fatal_error(OS.str()); + } + + // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop + // the volatile semantics and preserve the atomic ones. + + // PTX volatile and PTX atomics are not available for statespace that differ + // from .generic, .global, or .shared. The behavior of PTX volatile and PTX + // atomics is undefined if the generic address does not refer to a .global or + // .shared memory location. + bool AddrGenericOrGlobalOrShared = + (CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC || + CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL || + CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED); + bool UseRelaxedMMIO = + HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL; + + switch (Ordering) { + case AtomicOrdering::NotAtomic: + return N->isVolatile() && AddrGenericOrGlobalOrShared + ? NVPTX::PTXLdStInstCode::Volatile + : NVPTX::PTXLdStInstCode::NotAtomic; + case AtomicOrdering::Monotonic: + if (N->isVolatile()) + return UseRelaxedMMIO ? NVPTX::PTXLdStInstCode::RelaxedMMIO + : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile + : NVPTX::PTXLdStInstCode::NotAtomic; + else + return HasMemoryOrdering ? NVPTX::PTXLdStInstCode::Relaxed + : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile + : NVPTX::PTXLdStInstCode::NotAtomic; + case AtomicOrdering::Acquire: + if (!N->readMem()) { + SmallString<256> Msg; + raw_svector_ostream OS(Msg); + OS << "PTX only supports Acquire Ordering on reads: " + << N->getOperationName(); + N->print(OS); + report_fatal_error(OS.str()); + } + return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Acquire + : NVPTX::PTXLdStInstCode::NotAtomic; + case AtomicOrdering::Release: + if (!N->writeMem()) { + SmallString<256> Msg; + raw_svector_ostream OS(Msg); + OS << "PTX only supports Release Ordering on writes: " + << N->getOperationName(); + N->print(OS); + report_fatal_error(OS.str()); + } + return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Release + : NVPTX::PTXLdStInstCode::NotAtomic; + case AtomicOrdering::AcquireRelease: { + SmallString<256> Msg; + raw_svector_ostream OS(Msg); + OS << "PTX only supports AcquireRelease Ordering on read-modify-write: " + << N->getOperationName(); + N->print(OS); + report_fatal_error(OS.str()); + } + case AtomicOrdering::SequentiallyConsistent: + case AtomicOrdering::Unordered: + // TODO: support AcquireRelease and SequentiallyConsistent + SmallString<256> Msg; + raw_svector_ostream OS(Msg); + OS << "NVPTX backend does not support AtomicOrdering \"" + << toIRString(Ordering) << "\" yet."; + report_fatal_error(OS.str()); + } + + llvm_unreachable("unexpected unhandled case"); +} + static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, unsigned CodeAddrSpace, MachineFunction *F) { // We use ldg (i.e. ld.global.nc) for invariant loads from the global address @@ -916,32 +1080,18 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { if (!LoadedVT.isSimple()) return false; - AtomicOrdering Ordering = LD->getSuccessOrdering(); - // In order to lower atomic loads with stronger guarantees we would need to - // use load.acquire or insert fences. However these features were only added - // with PTX ISA 6.0 / sm_70. - // TODO: Check if we can actually use the new instructions and implement them. - if (isStrongerThanMonotonic(Ordering)) - return false; - // Address Space Setting unsigned int CodeAddrSpace = getCodeAddrSpace(LD); if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) { return tryLDGLDU(N); } + // Memory Semantic Setting + unsigned int CodeMemorySem = getCodeMemorySemantic(LD, Subtarget); + unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace()); - // Volatile Setting - // - .volatile is only available for .global and .shared - // - .volatile has the same memory synchronization semantics as .relaxed.sys - bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic; - if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && - CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && - CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) - isVolatile = false; - // Type Setting: fromType + fromTypeWidth // // Sign : ISD::SEXTLOAD @@ -982,9 +1132,13 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_avar, NVPTX::LD_f64_avar); if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), Addr, Chain }; + SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), + getI32Imm(CodeAddrSpace, dl), + getI32Imm(vecType, dl), + getI32Imm(fromType, dl), + getI32Imm(fromTypeWidth, dl), + Addr, + Chain}; NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset) : SelectADDRsi(N1.getNode(), N1, Base, Offset)) { @@ -993,9 +1147,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_asi, NVPTX::LD_f64_asi); if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), Base, Offset, Chain }; + SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), + getI32Imm(CodeAddrSpace, dl), + getI32Imm(vecType, dl), + getI32Imm(fromType, dl), + getI32Imm(fromTypeWidth, dl), + Base, + Offset, + Chain}; NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset) : SelectADDRri(N1.getNode(), N1, Base, Offset)) { @@ -1010,9 +1169,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_ari, NVPTX::LD_f64_ari); if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), Base, Offset, Chain }; + SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), + getI32Imm(CodeAddrSpace, dl), + getI32Imm(vecType, dl), + getI32Imm(fromType, dl), + getI32Imm(fromTypeWidth, dl), + Base, + Offset, + Chain}; NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); } else { if (PointerSize == 64) @@ -1026,9 +1190,13 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_areg, NVPTX::LD_f64_areg); if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), N1, Chain }; + SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), + getI32Imm(CodeAddrSpace, dl), + getI32Imm(vecType, dl), + getI32Imm(fromType, dl), + getI32Imm(fromTypeWidth, dl), + N1, + Chain}; NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); } @@ -1065,13 +1233,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); - // Volatile Setting - // - .volatile is only availalble for .global and .shared - bool IsVolatile = MemSD->isVolatile(); - if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && - CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && - CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) - IsVolatile = false; + // Memory Semantic Setting + unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget); // Vector Setting MVT SimpleVT = LoadedVT.getSimpleVT(); @@ -1138,9 +1301,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), Addr, Chain }; + SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), + getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), + getI32Imm(FromType, DL), + getI32Imm(FromTypeWidth, DL), + Addr, + Chain}; LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); } else if (PointerSize == 64 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset) @@ -1163,9 +1330,14 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), Base, Offset, Chain }; + SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), + getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), + getI32Imm(FromType, DL), + getI32Imm(FromTypeWidth, DL), + Base, + Offset, + Chain}; LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); } else if (PointerSize == 64 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset) @@ -1208,9 +1380,14 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), Base, Offset, Chain }; + SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), + getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), + getI32Imm(FromType, DL), + getI32Imm(FromTypeWidth, DL), + Base, + Offset, + Chain}; LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); } else { @@ -1253,9 +1430,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), Op1, Chain }; + SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), + getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), + getI32Imm(FromType, DL), + getI32Imm(FromTypeWidth, DL), + Op1, + Chain}; LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); } @@ -1698,27 +1879,13 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!StoreVT.isSimple()) return false; - AtomicOrdering Ordering = ST->getSuccessOrdering(); - // In order to lower atomic loads with stronger guarantees we would need to - // use store.release or insert fences. However these features were only added - // with PTX ISA 6.0 / sm_70. - // TODO: Check if we can actually use the new instructions and implement them. - if (isStrongerThanMonotonic(Ordering)) - return false; - // Address Space Setting unsigned int CodeAddrSpace = getCodeAddrSpace(ST); unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace()); - // Volatile Setting - // - .volatile is only available for .global and .shared - // - .volatile has the same memory synchronization semantics as .relaxed.sys - bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic; - if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && - CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && - CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) - isVolatile = false; + // Memory Semantic Setting + unsigned int CodeMemorySem = getCodeMemorySemantic(ST, Subtarget); // Vector Setting MVT SimpleVT = StoreVT.getSimpleVT(); @@ -1755,7 +1922,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = {Value, - getI32Imm(isVolatile, dl), + getI32Imm(CodeMemorySem, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(toType, dl), @@ -1772,7 +1939,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = {Value, - getI32Imm(isVolatile, dl), + getI32Imm(CodeMemorySem, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(toType, dl), @@ -1797,7 +1964,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { return false; SDValue Ops[] = {Value, - getI32Imm(isVolatile, dl), + getI32Imm(CodeMemorySem, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(toType, dl), @@ -1819,7 +1986,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = {Value, - getI32Imm(isVolatile, dl), + getI32Imm(CodeMemorySem, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(toType, dl), @@ -1858,13 +2025,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); - // Volatile Setting - // - .volatile is only availalble for .global and .shared - bool IsVolatile = MemSD->isVolatile(); - if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && - CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && - CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) - IsVolatile = false; + // Memory Semantic Setting + unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget); // Type Setting: toType + toTypeWidth // - for integer type, always use 'u' @@ -1906,7 +2068,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { ToTypeWidth = 32; } - StOps.push_back(getI32Imm(IsVolatile, DL)); + StOps.push_back(getI32Imm(CodeMemorySem, DL)); StOps.push_back(getI32Imm(CodeAddrSpace, DL)); StOps.push_back(getI32Imm(VecType, DL)); StOps.push_back(getI32Imm(ToType, DL)); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 827febe845a4c7..7f1ac8688007ea 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1150,6 +1150,18 @@ def DoubleConst1 : PatLeaf<(fpimm), [{ return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() && N->getValueAPF().convertToDouble() == 1.0; }]>; +// Constant -1.0 (double) +def DoubleConstNeg1 : PatLeaf<(fpimm), [{ + return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() && + N->getValueAPF().convertToDouble() == -1.0; +}]>; + + +// Constant -X -> X (double) +def NegDoubleConst : SDNodeXFormgetTargetConstantFP(-(N->getValueAPF()), + SDLoc(N), MVT::f64); +}]>; // Loads FP16 constant into a register. // @@ -1225,6 +1237,11 @@ def FDIV64ri : "div.rn.f64 \t$dst, $a, $b;", [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>; +// fdiv will be converted to rcp +// fneg (fdiv 1.0, X) => fneg (rcp.rn X) +def : Pat<(fdiv DoubleConstNeg1:$a, Float64Regs:$b), + (FNEGf64 (FDIV641r (NegDoubleConst node:$a), Float64Regs:$b))>; + // // F32 Approximate reciprocal // @@ -2941,39 +2958,39 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in { multiclass LD { def _avar : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; def _areg : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; def _areg_64 : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; def _ari : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr+$offset];", []>; def _ari_64 : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr+$offset];", []>; def _asi : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr+$offset];", []>; } @@ -2989,39 +3006,39 @@ let mayLoad=1, hasSideEffects=0 in { multiclass ST { def _avar : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr], $src;", []>; def _areg : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, + (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr], $src;", []>; def _areg_64 : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr], $src;", []>; def _ari : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr+$offset], $src;", []>; def _ari_64 : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr+$offset], $src;", []>; def _asi : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr+$offset], $src;", []>; } @@ -3040,75 +3057,75 @@ let mayStore=1, hasSideEffects=0 in { multiclass LD_VEC { def _v2_avar : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v2_areg : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v2_areg_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v2_ari : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; def _v2_ari_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; def _v2_asi : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; def _v4_avar : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; def _v4_areg : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; def _v4_areg_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; def _v4_ari : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; def _v4_ari_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; def _v4_asi : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; } let mayLoad=1, hasSideEffects=0 in { @@ -3123,84 +3140,84 @@ let mayLoad=1, hasSideEffects=0 in { multiclass ST_VEC { def _v2_avar : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v2_areg : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v2_areg_64 : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v2_ari : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2}};", []>; def _v2_ari_64 : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2}};", []>; def _v2_asi : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2}};", []>; def _v4_avar : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; def _v4_areg : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; def _v4_areg_64 : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; def _v4_ari : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; def _v4_ari_64 : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; def _v4_asi : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}" "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; } diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 3ca4c1a24c79a1..8df41913ff12ef 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -78,13 +78,18 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { bool hasAtomBitwise64() const { return SmVersion >= 32; } bool hasAtomMinMax64() const { return SmVersion >= 32; } bool hasLDG() const { return SmVersion >= 32; } - inline bool hasHWROT32() const { return SmVersion >= 32; } + bool hasHWROT32() const { return SmVersion >= 32; } bool hasImageHandles() const; bool hasFP16Math() const { return SmVersion >= 53; } bool hasBF16Math() const { return SmVersion >= 80; } bool allowFP16Math() const; bool hasMaskOperator() const { return PTXVersion >= 71; } bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; } + // Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire, + // release, acq_rel, sc) ? + bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; } + // Does SM & PTX support atomic relaxed MMIO operations ? + bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; } unsigned int getFullSmVersion() const { return FullSmVersion; } unsigned int getSmVersion() const { return getFullSmVersion() / 10; } // GPUs with "a" suffix have include architecture-accelerated features that diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 6aabf5cd8c5945..411114599543c9 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -48,7 +48,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetInstrInfo.h" diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp index 535a54a1a9a3c0..b8abee76cdfa80 100644 --- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -1291,6 +1291,10 @@ bool PPCMIPeephole::simplifyCode() { addRegToUpdate(OrigOp1Reg); if (MI.getOperand(1).isReg()) addRegToUpdate(MI.getOperand(1).getReg()); + if (ToErase && ToErase->getOperand(1).isReg()) + for (auto UseReg : ToErase->explicit_uses()) + if (UseReg.isReg()) + addRegToUpdate(UseReg.getReg()); ++NumRotatesCollapsed; } break; diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp index 6e0f429c34b2f6..0e84eda0c9d07a 100644 --- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp +++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp @@ -18,9 +18,11 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsRISCV.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -35,6 +37,7 @@ namespace { class RISCVCodeGenPrepare : public FunctionPass, public InstVisitor { const DataLayout *DL; + const DominatorTree *DT; const RISCVSubtarget *ST; public: @@ -48,12 +51,14 @@ class RISCVCodeGenPrepare : public FunctionPass, void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addRequired(); AU.addRequired(); } bool visitInstruction(Instruction &I) { return false; } bool visitAnd(BinaryOperator &BO); bool visitIntrinsicInst(IntrinsicInst &I); + bool expandVPStrideLoad(IntrinsicInst &I); }; } // end anonymous namespace @@ -128,6 +133,9 @@ bool RISCVCodeGenPrepare::visitAnd(BinaryOperator &BO) { // Which eliminates the scalar -> vector -> scalar crossing during instruction // selection. bool RISCVCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) { + if (expandVPStrideLoad(I)) + return true; + if (I.getIntrinsicID() != Intrinsic::vector_reduce_fadd) return false; @@ -155,6 +163,53 @@ bool RISCVCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) { return true; } +// Always expand zero strided loads so we match more .vx splat patterns, even if +// we have +optimized-zero-stride-loads. RISCVDAGToDAGISel::Select will convert +// it back to a strided load if it's optimized. +bool RISCVCodeGenPrepare::expandVPStrideLoad(IntrinsicInst &II) { + Value *BasePtr, *VL; + + using namespace PatternMatch; + if (!match(&II, m_Intrinsic( + m_Value(BasePtr), m_Zero(), m_AllOnes(), m_Value(VL)))) + return false; + + // If SEW>XLEN then a splat will get lowered as a zero strided load anyway, so + // avoid expanding here. + if (II.getType()->getScalarSizeInBits() > ST->getXLen()) + return false; + + if (!isKnownNonZero(VL, {*DL, DT, nullptr, &II})) + return false; + + auto *VTy = cast(II.getType()); + + IRBuilder<> Builder(&II); + + // Extend VL from i32 to XLen if needed. + if (ST->is64Bit()) + VL = Builder.CreateZExt(VL, Builder.getInt64Ty()); + + Type *STy = VTy->getElementType(); + Value *Val = Builder.CreateLoad(STy, BasePtr); + const auto &TLI = *ST->getTargetLowering(); + Value *Res; + + // TODO: Also support fixed/illegal vector types to splat with evl = vl. + if (isa(VTy) && TLI.isTypeLegal(EVT::getEVT(VTy))) { + unsigned VMVOp = STy->isFloatingPointTy() ? Intrinsic::riscv_vfmv_v_f + : Intrinsic::riscv_vmv_v_x; + Res = Builder.CreateIntrinsic(VMVOp, {VTy, VL->getType()}, + {PoisonValue::get(VTy), Val, VL}); + } else { + Res = Builder.CreateVectorSplat(VTy->getElementCount(), Val); + } + + II.replaceAllUsesWith(Res); + II.eraseFromParent(); + return true; +} + bool RISCVCodeGenPrepare::runOnFunction(Function &F) { if (skipFunction(F)) return false; @@ -164,6 +219,7 @@ bool RISCVCodeGenPrepare::runOnFunction(Function &F) { ST = &TM.getSubtarget(F); DL = &F.getDataLayout(); + DT = &getAnalysis().getDomTree(); bool MadeChange = false; for (auto &BB : F) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index d3cb2aeab41cb2..5a8605aa4a1978 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -151,7 +151,7 @@ def HasStdExtZimop : Predicate<"Subtarget->hasStdExtZimop()">, "'Zimop' (May-Be-Operations)">; def FeatureStdExtZicfilp - : RISCVExperimentalExtension<"zicfilp", 0, 4, + : RISCVExperimentalExtension<"zicfilp", 1, 0, "'Zicfilp' (Landing pad)", [FeatureStdExtZicsr]>; def HasStdExtZicfilp : Predicate<"Subtarget->hasStdExtZicfilp()">, @@ -161,7 +161,7 @@ def NoStdExtZicfilp : Predicate<"!Subtarget->hasStdExtZicfilp()">, AssemblerPredicate<(all_of (not FeatureStdExtZicfilp))>; def FeatureStdExtZicfiss - : RISCVExperimentalExtension<"zicfiss", 0, 4, + : RISCVExperimentalExtension<"zicfiss", 1, 0, "'Zicfiss' (Shadow stack)", [FeatureStdExtZicsr, FeatureStdExtZimop]>; def HasStdExtZicfiss : Predicate<"Subtarget->hasStdExtZicfiss()">, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index b8ba25df9910bb..8b5e56bff4097d 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1894,6 +1894,21 @@ bool RISCVTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const { return (SrcBits == 64 && DestBits == 32); } +bool RISCVTargetLowering::isTruncateFree(SDValue Val, EVT VT2) const { + EVT SrcVT = Val.getValueType(); + // free truncate from vnsrl and vnsra + if (Subtarget.hasStdExtV() && + (Val.getOpcode() == ISD::SRL || Val.getOpcode() == ISD::SRA) && + SrcVT.isVector() && VT2.isVector()) { + unsigned SrcBits = SrcVT.getVectorElementType().getSizeInBits(); + unsigned DestBits = VT2.getVectorElementType().getSizeInBits(); + if (SrcBits == DestBits * 2) { + return true; + } + } + return TargetLowering::isTruncateFree(Val, VT2); +} + bool RISCVTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { // Zexts are free if they can be combined with a load. // Don't advertise i32->i64 zextload as being free for RV64. It interacts @@ -14303,6 +14318,13 @@ struct NodeExtensionHelper { case RISCVISD::VMV_V_X_VL: return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, NarrowVT, DAG.getUNDEF(NarrowVT), Source.getOperand(1), VL); + case RISCVISD::VFMV_V_F_VL: + Source = Source.getOperand(1); + assert(Source.getOpcode() == ISD::FP_EXTEND && "Unexpected source"); + Source = Source.getOperand(0); + assert(Source.getValueType() == NarrowVT.getVectorElementType()); + return DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, NarrowVT, + DAG.getUNDEF(NarrowVT), Source, VL); default: // Other opcodes can only come from the original LHS of VW(ADD|SUB)_W_VL // and that operand should already have the right NarrowVT so no @@ -14460,7 +14482,7 @@ struct NodeExtensionHelper { if (ScalarBits < EltBits) return; - unsigned NarrowSize = VT.getScalarSizeInBits() / 2; + unsigned NarrowSize = EltBits / 2; // If the narrow type cannot be expressed with a legal VMV, // this is not a valid candidate. if (NarrowSize < 8) @@ -14518,6 +14540,24 @@ struct NodeExtensionHelper { case RISCVISD::VMV_V_X_VL: fillUpExtensionSupportForSplat(Root, DAG, Subtarget); break; + case RISCVISD::VFMV_V_F_VL: { + MVT VT = OrigOperand.getSimpleValueType(); + + if (!OrigOperand.getOperand(0).isUndef()) + break; + + SDValue Op = OrigOperand.getOperand(1); + if (Op.getOpcode() != ISD::FP_EXTEND) + break; + + unsigned NarrowSize = VT.getScalarSizeInBits() / 2; + unsigned ScalarBits = Op.getOperand(0).getValueSizeInBits(); + if (NarrowSize != ScalarBits) + break; + + SupportsFPExt = true; + break; + } default: break; } @@ -15467,12 +15507,9 @@ static SDValue performVFMADD_VLCombine(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineVFMADD_VLWithVFNEG_VL(N, DAG)) return V; - if (N->getValueType(0).isScalableVector() && - N->getValueType(0).getVectorElementType() == MVT::f32 && - (Subtarget.hasVInstructionsF16Minimal() && - !Subtarget.hasVInstructionsF16())) { + if (N->getValueType(0).getVectorElementType() == MVT::f32 && + !Subtarget.hasVInstructionsF16()) return SDValue(); - } // FIXME: Ignore strict opcodes for now. if (N->isTargetStrictFPOpcode()) @@ -17235,10 +17272,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, case RISCVISD::FMUL_VL: case RISCVISD::VFWADD_W_VL: case RISCVISD::VFWSUB_W_VL: { - if (N->getValueType(0).isScalableVector() && - N->getValueType(0).getVectorElementType() == MVT::f32 && - (Subtarget.hasVInstructionsF16Minimal() && - !Subtarget.hasVInstructionsF16())) + if (N->getValueType(0).getVectorElementType() == MVT::f32 && + !Subtarget.hasVInstructionsF16()) return SDValue(); return combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget); } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 7d8bceb5cb3417..2642a188820e14 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -497,6 +497,7 @@ class RISCVTargetLowering : public TargetLowering { bool isLegalAddImmediate(int64_t Imm) const override; bool isTruncateFree(Type *SrcTy, Type *DstTy) const override; bool isTruncateFree(EVT SrcVT, EVT DstVT) const override; + bool isTruncateFree(SDValue Val, EVT VT2) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override; bool signExtendConstant(const ConstantInt *CI) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index c74e7ac929c6b0..d5b3df48d53b47 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -950,12 +950,6 @@ void RISCVInsertVSETVLI::forwardVSETVLIAVL(VSETVLIInfo &Info) const { VSETVLIInfo DefInstrInfo = getInfoForVSETVLI(*DefMI); if (!DefInstrInfo.hasSameVLMAX(Info)) return; - // If the AVL is a register with multiple definitions, don't forward it. We - // might not be able to extend its LiveInterval without clobbering other val - // nums. - if (DefInstrInfo.hasAVLReg() && - !LIS->getInterval(DefInstrInfo.getAVLReg()).containsOneValue()) - return; Info.setAVL(DefInstrInfo); } @@ -1149,15 +1143,32 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, .addImm(Info.encodeVTYPE()); if (LIS) { LIS->InsertMachineInstrInMaps(*MI); - // Normally the AVL's live range will already extend past the inserted - // vsetvli because the pseudos below will already use the AVL. But this - // isn't always the case, e.g. PseudoVMV_X_S doesn't have an AVL operand or - // we've taken the AVL from the VL output of another vsetvli. LiveInterval &LI = LIS->getInterval(AVLReg); SlotIndex SI = LIS->getInstructionIndex(*MI).getRegSlot(); - assert((LI.liveAt(SI) && LI.getVNInfoAt(SI) == Info.getAVLVNInfo()) || - (!LI.liveAt(SI) && LI.containsOneValue())); - LIS->extendToIndices(LI, SI); + // If the AVL value isn't live at MI, do a quick check to see if it's easily + // extendable. Otherwise, we need to copy it. + if (LI.getVNInfoBefore(SI) != Info.getAVLVNInfo()) { + if (!LI.liveAt(SI) && LI.containsOneValue()) + LIS->extendToIndices(LI, SI); + else { + Register AVLCopyReg = + MRI->createVirtualRegister(&RISCV::GPRNoX0RegClass); + MachineBasicBlock::iterator II; + if (Info.getAVLVNInfo()->isPHIDef()) + II = LIS->getMBBFromIndex(Info.getAVLVNInfo()->def)->getFirstNonPHI(); + else { + II = LIS->getInstructionFromIndex(Info.getAVLVNInfo()->def); + II = std::next(II); + } + assert(II.isValid()); + auto AVLCopy = + BuildMI(*II->getParent(), II, DL, TII->get(RISCV::COPY), AVLCopyReg) + .addReg(AVLReg); + LIS->InsertMachineInstrInMaps(*AVLCopy); + MI->getOperand(1).setReg(AVLCopyReg); + LIS->createAndComputeVirtRegInterval(AVLCopyReg); + } + } } } @@ -1633,6 +1644,24 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const { Used.demandVL(); Used.demandVTYPE(); SmallVector ToDelete; + + // Update LIS and cleanup dead AVLs given a value which has + // has had one use (as an AVL) removed. + auto afterDroppedAVLUse = [&](Register OldVLReg) { + if (LIS) + LIS->shrinkToUses(&LIS->getInterval(OldVLReg)); + + MachineInstr *VLOpDef = MRI->getUniqueVRegDef(OldVLReg); + if (VLOpDef && TII->isAddImmediate(*VLOpDef, OldVLReg) && + MRI->use_nodbg_empty(OldVLReg)) { + if (LIS) { + LIS->removeInterval(OldVLReg); + LIS->RemoveMachineInstrFromMaps(*VLOpDef); + } + VLOpDef->eraseFromParent(); + } + }; + for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) { if (!isVectorConfigInstr(MI)) { @@ -1685,22 +1714,9 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const { MI.getOperand(1).ChangeToImmediate(NextMI->getOperand(1).getImm()); else MI.getOperand(1).ChangeToRegister(NextMI->getOperand(1).getReg(), false); + if (OldVLReg && OldVLReg.isVirtual()) + afterDroppedAVLUse(OldVLReg); - if (OldVLReg && OldVLReg.isVirtual()) { - // MI no longer uses OldVLReg so shrink its LiveInterval. - if (LIS) - LIS->shrinkToUses(&LIS->getInterval(OldVLReg)); - - MachineInstr *VLOpDef = MRI->getUniqueVRegDef(OldVLReg); - if (VLOpDef && TII->isAddImmediate(*VLOpDef, OldVLReg) && - MRI->use_nodbg_empty(OldVLReg)) { - if (LIS) { - LIS->removeInterval(OldVLReg); - LIS->RemoveMachineInstrFromMaps(*VLOpDef); - } - VLOpDef->eraseFromParent(); - } - } MI.setDesc(NextMI->getDesc()); } MI.getOperand(2).setImm(NextMI->getOperand(2).getImm()); @@ -1720,8 +1736,8 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const { if (MI->getOperand(1).isReg()) OldAVLReg = MI->getOperand(1).getReg(); MI->eraseFromParent(); - if (LIS && OldAVLReg && OldAVLReg.isVirtual()) - LIS->shrinkToUses(&LIS->getInterval(OldAVLReg)); + if (OldAVLReg && OldAVLReg.isVirtual()) + afterDroppedAVLUse(OldAVLReg); } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td index 79b960c6da21c4..3bd6da28682863 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td @@ -826,3 +826,36 @@ let Predicates = [HasVendorXCVbi, IsRV32], AddedComplexity = 2 in { def : Selectbi; def : Selectbi; } + +class PatCoreVMacGprGprGpr + : Pat<(!cast("int_riscv_cv_mac_" # intr) GPR:$rs1, GPR:$rs2, GPR:$rd), + (!cast("CV_" # asm) GPR:$rd, GPR:$rs1, GPR:$rs2)>; +class PatCoreVMacGprGprGprUimm5 + : Pat<(!cast("int_riscv_cv_mac_" # intr) GPR:$rs1, GPR:$rs2, GPR:$rd, cv_tuimm5:$imm5), + (!cast("CV_" # asm) GPR:$rd, GPR:$rs1, GPR:$rs2, cv_tuimm5:$imm5)>; +class PatCoreVMacGprGprUimm5 + : Pat<(!cast("int_riscv_cv_mac_" # intr) GPR:$rs1, GPR:$rs2, cv_tuimm5:$imm5), + (!cast("CV_" # asm) GPR:$rs1, GPR:$rs2, cv_tuimm5:$imm5)>; + +let Predicates = [HasVendorXCVmac] in { + def : PatCoreVMacGprGprGpr<"mac", "MAC">; + def : PatCoreVMacGprGprGpr<"msu", "MSU">; + + def : PatCoreVMacGprGprUimm5<"muluN", "MULUN">; + def : PatCoreVMacGprGprUimm5<"mulhhuN", "MULHHUN">; + def : PatCoreVMacGprGprUimm5<"mulsN", "MULSN">; + def : PatCoreVMacGprGprUimm5<"mulhhsN", "MULHHSN">; + def : PatCoreVMacGprGprUimm5<"muluRN", "MULURN">; + def : PatCoreVMacGprGprUimm5<"mulhhuRN", "MULHHURN">; + def : PatCoreVMacGprGprUimm5<"mulsRN", "MULSRN">; + def : PatCoreVMacGprGprUimm5<"mulhhsRN", "MULHHSRN">; + + def : PatCoreVMacGprGprGprUimm5<"macuN", "MACUN">; + def : PatCoreVMacGprGprGprUimm5<"machhuN", "MACHHUN">; + def : PatCoreVMacGprGprGprUimm5<"macsN", "MACSN">; + def : PatCoreVMacGprGprGprUimm5<"machhsN", "MACHHSN">; + def : PatCoreVMacGprGprGprUimm5<"macuRN", "MACURN">; + def : PatCoreVMacGprGprGprUimm5<"machhuRN", "MACHHURN">; + def : PatCoreVMacGprGprGprUimm5<"macsRN", "MACSRN">; + def : PatCoreVMacGprGprGprUimm5<"machhsRN", "MACHHSRN">; +} diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 3e5272cb180a45..0a7229a2bc0fb3 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -6727,8 +6727,8 @@ SDValue SystemZTargetLowering::combineSIGN_EXTEND_INREG( if (EVT == MVT::i1 && N0.hasOneUse() && N0.getOpcode() == ISD::SETCC) { SDLoc DL(N0); SDValue Ops[] = { N0.getOperand(0), N0.getOperand(1), - DAG.getConstant(-1, DL, VT), DAG.getConstant(0, DL, VT), - N0.getOperand(2) }; + DAG.getAllOnesConstant(DL, VT), + DAG.getConstant(0, DL, VT), N0.getOperand(2) }; return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops); } return SDValue(); diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 2290a7d62e89fc..1e7285e3e0fc53 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -507,6 +507,8 @@ class SystemZTargetLowering : public TargetLowering { bool shouldConsiderGEPOffsetSplit() const override { return true; } + bool shouldExpandCmpUsingSelects() const override { return true; } + const char *getTargetNodeName(unsigned Opcode) const override; std::pair getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td index 3dba33b66bf4f4..9a12718db7cb95 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td @@ -363,7 +363,8 @@ class InstRIEe op, dag outs, dag ins, string asmstr, list pattern> let Inst{7-0} = op{7-0}; } -class InstRIEf op, dag outs, dag ins, string asmstr, list pattern> +class InstRIEf op, dag outs, dag ins, string asmstr, list pattern, + bits<8> I3Or = 0, bits<8> I4Or = 0> : InstSystemZ<6, outs, ins, asmstr, pattern> { field bits<48> Inst; field bits<48> SoftFail = 0; @@ -377,8 +378,22 @@ class InstRIEf op, dag outs, dag ins, string asmstr, list pattern> let Inst{47-40} = op{15-8}; let Inst{39-36} = R1; let Inst{35-32} = R2; - let Inst{31-24} = I3; - let Inst{23-16} = I4; + let Inst{31} = !if(I3Or{7}, 1, I3{7}); + let Inst{30} = !if(I3Or{6}, 1, I3{6}); + let Inst{29} = !if(I3Or{5}, 1, I3{5}); + let Inst{28} = !if(I3Or{4}, 1, I3{4}); + let Inst{27} = !if(I3Or{3}, 1, I3{3}); + let Inst{26} = !if(I3Or{2}, 1, I3{2}); + let Inst{25} = !if(I3Or{1}, 1, I3{1}); + let Inst{24} = !if(I3Or{0}, 1, I3{0}); + let Inst{23} = !if(I4Or{7}, 1, I4{7}); + let Inst{22} = !if(I4Or{6}, 1, I4{6}); + let Inst{21} = !if(I4Or{5}, 1, I4{5}); + let Inst{20} = !if(I4Or{4}, 1, I4{4}); + let Inst{19} = !if(I4Or{3}, 1, I4{3}); + let Inst{18} = !if(I4Or{2}, 1, I4{2}); + let Inst{17} = !if(I4Or{1}, 1, I4{1}); + let Inst{16} = !if(I4Or{0}, 1, I4{0}); let Inst{15-8} = I5; let Inst{7-0} = op{7-0}; } @@ -2349,6 +2364,12 @@ class AsmCondBranchRR opcode> : InstRR; +class NeverCondBranchRR opcode> + : InstRR { + let R1 = 0; +} + class FixedCondBranchRR opcode, SDPatternOperator operator = null_frag> : InstRR opcode> (ins imm32zx4:$M1, (bdxaddr12only $B2, $D2, $X2):$XBD2), mnemonic#"\t$M1, $XBD2", []>; +class NeverCondBranchRX opcode> + : InstRXb { + let M1 = 0; +} + class FixedCondBranchRX opcode> : InstRXb { @@ -3439,6 +3467,19 @@ class BinaryRRFa opcode, SDPatternOperator operator, let OpType = "reg"; } + +class UnaryRRFa opcode, SDPatternOperator operator, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRFa { + let R3 = R2; + let M4 = 0; + let OpKey = mnemonic#cls1; + let OpType = "reg"; +} + + multiclass BinaryRRAndK opcode1, bits<16> opcode2, SDPatternOperator operator, RegisterOperand cls1, RegisterOperand cls2> { @@ -4999,11 +5040,11 @@ multiclass CmpSwapRSPair rsOpcode, bits<16> rsyOpcode, } class RotateSelectRIEf opcode, RegisterOperand cls1, - RegisterOperand cls2> + RegisterOperand cls2, bits<8> I3Or = 0, bits<8> I4Or = 0> : InstRIEf { + mnemonic#"\t$R1, $R2, $I3, $I4, $I5", [], I3Or, I4Or> { let Constraints = "$R1 = $R1src"; let DisableEncoding = "$R1src"; } diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td index 7c6ab3f9b1ab5f..7ab0b366363045 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -111,11 +111,11 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in { // NOPs. These are again variants of the conditional branches, with the // condition mask set to "never". NOP_bare can't be an InstAlias since it // would need R0D hard coded which is not part of ADDR64BitRegClass. -def NOP : InstAlias<"nop\t$XBD", (BCAsm 0, bdxaddr12only:$XBD), 0>; +def NOP : NeverCondBranchRX<"nop", 0x47>; let isAsmParserOnly = 1, hasNoSchedulingInfo = 1, M1 = 0, X2 = 0, B2 = 0, D2 = 0 in def NOP_bare : InstRXb<0x47,(outs), (ins), "nop", []>; -def NOPR : InstAlias<"nopr\t$R", (BCRAsm 0, GR64:$R), 0>; -def NOPR_bare : InstAlias<"nopr", (BCRAsm 0, R0D), 0>; +def NOPR : NeverCondBranchRR<"nopr", 0x07>; +def NOPR_bare : InstAlias<"nopr", (NOPR R0D), 0>; // An alias of BRC 0, label def JNOP : InstAlias<"jnop\t$RI2", (BRCAsm 0, brtarget16:$RI2), 0>; @@ -464,6 +464,8 @@ let isAsCheapAsAMove = 1, isMoveImm = 1, isReMaterializable = 1 in { def LLILF : UnaryRIL<"llilf", 0xC0F, bitconvert, GR64, imm64lf32>; def LLIHF : UnaryRIL<"llihf", 0xC0E, bitconvert, GR64, imm64hf32>; } +def LLGFI : InstAlias<"llgfi\t$R1, $RI1", (LLILF GR64:$R1, imm64lf32:$RI1)>; +def LLGHI : InstAlias<"llghi\t$R1, $RI1", (LLILL GR64:$R1, imm64ll16:$RI1)>; // Register loads. let canFoldAsLoad = 1, SimpleBDXLoad = 1, mayLoad = 1 in { @@ -973,6 +975,7 @@ let isAsCheapAsAMove = 1, isMoveImm = 1, isReMaterializable = 1 in { def IILF : UnaryRIL<"iilf", 0xC09, bitconvert, GR32, uimm32>; def IIHF : UnaryRIL<"iihf", 0xC08, bitconvert, GRH32, uimm32>; } +def LFI : InstAlias<"lfi\t$R1, $RI1", (IILF GR32:$R1, uimm32:$RI1)>; def IILF64 : BinaryAliasRIL; def IIHF64 : BinaryAliasRIL; @@ -1372,6 +1375,10 @@ let Predicates = [FeatureMiscellaneousExtensions3], let isCommutable = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in { def NORK : BinaryRRFa<"nork", 0xB976, nor, GR32, GR32, GR32>; def NOGRK : BinaryRRFa<"nogrk", 0xB966, nor, GR64, GR64, GR64>; + let isAsmParserOnly = 1 in { + def NOTR : UnaryRRFa<"notr", 0xB976, nor, GR32, GR32>; + def NOTGR : UnaryRRFa<"notgr", 0xB966, nor, GR64, GR64>; + } } // NXOR. @@ -1526,13 +1533,17 @@ def RLLG : BinaryRSY<"rllg", 0xEB1C, shiftop, GR64>; let Defs = [CC] in { let isCodeGenOnly = 1 in def RISBG32 : RotateSelectRIEf<"risbg", 0xEC55, GR32, GR32>; - let CCValues = 0xE, CompareZeroCCMask = 0xE in + let CCValues = 0xE, CompareZeroCCMask = 0xE in { def RISBG : RotateSelectRIEf<"risbg", 0xEC55, GR64, GR64>; + def RISBGZ : RotateSelectRIEf<"risbgz", 0xEC55, GR64, GR64, 0, 128>; + } } // On zEC12 we have a variant of RISBG that does not set CC. -let Predicates = [FeatureMiscellaneousExtensions] in +let Predicates = [FeatureMiscellaneousExtensions] in { def RISBGN : RotateSelectRIEf<"risbgn", 0xEC59, GR64, GR64>; + def RISBGNZ : RotateSelectRIEf<"risbgnz", 0xEC59, GR64, GR64, 0, 128>; +} // Forms of RISBG that only affect one word of the destination register. // They do not set CC. @@ -2330,6 +2341,8 @@ defm : BlockLoadStore; def JCT : MnemonicAlias<"jct", "brct">; def JCTG : MnemonicAlias<"jctg", "brctg">; +def JC : MnemonicAlias<"jc", "brc">; +def JCTH : MnemonicAlias<"jcth", "brcth">; def JAS : MnemonicAlias<"jas", "bras">; def JASL : MnemonicAlias<"jasl", "brasl">; def JXH : MnemonicAlias<"jxh", "brxh">; diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td index 9ce1a0d06b5afd..d0fec02777875a 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td @@ -506,9 +506,9 @@ def : InstRW<[WLat5LSU, WLat5LSU, FXa4, LSU, GroupAlone2], def : InstRW<[WLat2LSU, FXa, LSU, NormalGr], (instregex "RLL(G)?$")>; // Rotate and insert -def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBG(N|32)?$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBH(G|H|L)$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBL(G|H|L)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBG(N|32)?(Z)?$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBMux$")>; // Rotate and Select @@ -1553,5 +1553,11 @@ def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>; def : InstRW<[WLat30, MCD], (instregex "TPI$")>; def : InstRW<[WLat30, MCD], (instregex "SAL$")>; +//===----------------------------------------------------------------------===// +// NOPs +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, NormalGr], (instregex "NOP(R)?$")>; + } diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td index 120d4a457ee396..a6d89ce9443c5a 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td @@ -516,9 +516,9 @@ def : InstRW<[WLat5LSU, WLat5LSU, FXa4, LSU, GroupAlone2], def : InstRW<[WLat2LSU, FXa, LSU, NormalGr], (instregex "RLL(G)?$")>; // Rotate and insert -def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBG(N|32)?$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBH(G|H|L)$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBL(G|H|L)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBG(N|32)?(Z)?$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBMux$")>; // Rotate and Select @@ -1643,5 +1643,11 @@ def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>; def : InstRW<[WLat30, MCD], (instregex "TPE?I$")>; def : InstRW<[WLat30, MCD], (instregex "SAL$")>; +//===----------------------------------------------------------------------===// +// NOPs +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, NormalGr], (instregex "NOP(R)?$")>; + } diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td index acba3a1fd9919e..455354e283ad8e 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td @@ -471,6 +471,7 @@ def : InstRW<[WLat1, FXa, NormalGr], (instregex "NC(G)?RK$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "OC(G)?RK$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "NN(G)?RK$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "NO(G)?RK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NOT(G)?R$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "NX(G)?RK$")>; //===----------------------------------------------------------------------===// @@ -530,9 +531,9 @@ def : InstRW<[WLat5LSU, WLat5LSU, FXa4, LSU, GroupAlone2], def : InstRW<[WLat2LSU, FXa, LSU, NormalGr], (instregex "RLL(G)?$")>; // Rotate and insert -def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBG(N|32)?$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBH(G|H|L)$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBL(G|H|L)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBG(N|32)?(Z)?$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBMux$")>; // Rotate and Select @@ -1689,5 +1690,10 @@ def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>; def : InstRW<[WLat30, MCD], (instregex "TPE?I$")>; def : InstRW<[WLat30, MCD], (instregex "SAL$")>; +//===----------------------------------------------------------------------===// +// NOPs +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, NormalGr], (instregex "NOP(R)?$")>; } diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td index dd82b2b9b71e75..92abf0ba4022cc 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td @@ -471,6 +471,7 @@ def : InstRW<[WLat1, FXa, NormalGr], (instregex "NC(G)?RK$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "OC(G)?RK$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "NN(G)?RK$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "NO(G)?RK$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "NOT(G)?R$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "NX(G)?RK$")>; //===----------------------------------------------------------------------===// @@ -530,9 +531,9 @@ def : InstRW<[WLat5LSU, WLat5LSU, FXa4, LSU, GroupAlone2], def : InstRW<[WLat2LSU, FXa, LSU, NormalGr], (instregex "RLL(G)?$")>; // Rotate and insert -def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBG(N|32)?$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBH(G|H|L)$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBL(G|H|L)$")>; +def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBG(N|32)?(Z)?$")>; def : InstRW<[WLat1, FXa, NormalGr], (instregex "RISBMux$")>; // Rotate and Select @@ -1722,5 +1723,10 @@ def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>; def : InstRW<[WLat30, MCD], (instregex "TPE?I$")>; def : InstRW<[WLat30, MCD], (instregex "SAL$")>; +//===----------------------------------------------------------------------===// +// NOPs +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, FXb, NormalGr], (instregex "NOP(R)?$")>; } diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td index 226db9d4272f9b..99d0d674bbbb2f 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td @@ -469,9 +469,9 @@ def : InstRW<[WLat5LSU, WLat5LSU, FXU4, LSU, GroupAlone2], def : InstRW<[WLat2LSU, FXU, LSU, NormalGr], (instregex "RLL(G)?$")>; // Rotate and insert -def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBG(32)?$")>; def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBH(G|H|L)$")>; def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBL(G|H|L)$")>; +def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBG(32)?(Z)?$")>; def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBMux$")>; // Rotate and Select @@ -1235,5 +1235,10 @@ def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>; def : InstRW<[WLat30, MCD], (instregex "TPI$")>; def : InstRW<[WLat30, MCD], (instregex "SAL$")>; +//===----------------------------------------------------------------------===// +// NOPs +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, LSU, EndGroup], (instregex "NOP(R)?$")>; } diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td b/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td index f5ecdb1f438009..5b334da2bac342 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td @@ -480,9 +480,9 @@ def : InstRW<[WLat5LSU, WLat5LSU, FXU4, LSU, GroupAlone2], def : InstRW<[WLat2LSU, FXU, LSU, NormalGr], (instregex "RLL(G)?$")>; // Rotate and insert -def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBG(N|32)?$")>; def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBH(G|H|L)$")>; def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBL(G|H|L)$")>; +def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBG(N|32)?(Z)?$")>; def : InstRW<[WLat1, FXU, NormalGr], (instregex "RISBMux$")>; // Rotate and Select @@ -1280,5 +1280,10 @@ def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>; def : InstRW<[WLat30, MCD], (instregex "TPI$")>; def : InstRW<[WLat30, MCD], (instregex "SAL$")>; +//===----------------------------------------------------------------------===// +// NOPs +//===----------------------------------------------------------------------===// + +def : InstRW<[WLat1, LSU, NormalGr], (instregex "NOP(R)?$")>; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index 20e50c8c9e1ae0..e7810e18d44d42 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -21,7 +21,7 @@ #include "WebAssemblyRuntimeLibcallSignatures.h" #include "WebAssemblySubtarget.h" #include "WebAssemblyUtilities.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" using namespace llvm; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h index ff6515d5bf4e67..966b84aa4951be 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h @@ -16,7 +16,7 @@ #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/RuntimeLibcallUtil.h" namespace llvm { diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index e49e96ceef6a4a..c7f88fed9b128b 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -58,6 +58,10 @@ static bool checkScale(unsigned Scale, StringRef &ErrMsg) { namespace { +// Including the generated SSE2AVX compression tables. +#define GET_X86_SSE2AVX_TABLE +#include "X86GenInstrMapping.inc" + static const char OpPrecedence[] = { 0, // IC_OR 1, // IC_XOR @@ -475,7 +479,9 @@ class X86AsmParser : public MCTargetAsmParser { unsigned getLength() const { return CurType.Length; } int64_t getImm() { return Imm + IC.execute(); } bool isValidEndState() const { - return State == IES_RBRAC || State == IES_INTEGER; + return State == IES_RBRAC || State == IES_RPAREN || + State == IES_INTEGER || State == IES_REGISTER || + State == IES_OFFSET; } // Is the intel expression appended after an operand index. @@ -1898,9 +1904,6 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { case AsmToken::Error: return Error(getLexer().getErrLoc(), getLexer().getErr()); break; - case AsmToken::EndOfStatement: - Done = true; - break; case AsmToken::Real: // DotOperator: [ebx].0 UpdateLocLex = false; @@ -3745,7 +3748,27 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, return false; } +static bool convertSSEToAVX(MCInst &Inst) { + ArrayRef Table{X86SSE2AVXTable}; + unsigned Opcode = Inst.getOpcode(); + const auto I = llvm::lower_bound(Table, Opcode); + if (I == Table.end() || I->OldOpc != Opcode) + return false; + + Inst.setOpcode(I->NewOpc); + // AVX variant of BLENDVPD/BLENDVPS/PBLENDVB instructions has more + // operand compare to SSE variant, which is added below + if (X86::isBLENDVPD(Opcode) || X86::isBLENDVPS(Opcode) || + X86::isPBLENDVB(Opcode)) + Inst.addOperand(Inst.getOperand(2)); + + return true; +} + bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { + if (MCOptions.X86Sse2Avx && convertSSEToAVX(Inst)) + return true; + if (ForcedOpcodePrefix != OpcodePrefix_VEX3 && X86::optimizeInstFromVEX3ToVEX2(Inst, MII.get(Inst.getOpcode()))) return true; diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 8cb003b838d06b..9dafd5e628ca8f 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -86,14 +86,8 @@ def FeatureSSE42 : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42", // The MMX subtarget feature is separate from the rest of the SSE features // because it's important (for odd compatibility reasons) to be able to // turn it off explicitly while allowing SSE+ to be on. -def FeatureMMX : SubtargetFeature<"mmx","X863DNowLevel", "MMX", +def FeatureMMX : SubtargetFeature<"mmx","HasMMX", "true", "Enable MMX instructions">; -def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow", - "Enable 3DNow! instructions", - [FeatureMMX]>; -def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA", - "Enable 3DNow! Athlon instructions", - [Feature3DNow]>; // All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied // feature, because SSE2 can be disabled (e.g. for compiling OS kernels) // without disabling 64-bit mode. Nothing should imply this feature bit. It @@ -1341,7 +1335,6 @@ def ProcessorFeatures { list BarcelonaFeatures = [FeatureX87, FeatureCX8, FeatureSSE4A, - Feature3DNowA, FeatureFXSR, FeatureNOPL, FeatureCX16, @@ -1834,32 +1827,32 @@ def : ProcModel; -def : Proc<"k6-2", [FeatureX87, FeatureCX8, Feature3DNow], +def : Proc<"k6-2", [FeatureX87, FeatureCX8, FeatureMMX, FeaturePRFCHW], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"k6-3", [FeatureX87, FeatureCX8, Feature3DNow], +def : Proc<"k6-3", [FeatureX87, FeatureCX8, FeatureMMX, FeaturePRFCHW], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; foreach P = ["athlon", "athlon-tbird"] in { - def : Proc; } foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in { def : Proc; } foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in { - def : Proc; } foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in { - def : Proc; -def : Proc<"geode", [FeatureX87, FeatureCX8, Feature3DNowA], +def : Proc<"geode", [FeatureX87, FeatureCX8, FeatureMMX, FeaturePRFCHW], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; def : Proc<"winchip-c6", [FeatureX87, FeatureMMX], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"winchip2", [FeatureX87, Feature3DNow], +def : Proc<"winchip2", [FeatureX87, FeatureMMX, FeaturePRFCHW], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; -def : Proc<"c3", [FeatureX87, Feature3DNow], +def : Proc<"c3", [FeatureX87, FeatureMMX, FeaturePRFCHW], [TuningSlowUAMem16, TuningInsertVZEROUPPER]>; def : Proc<"c3-2", [FeatureX87, FeatureCX8, FeatureMMX, FeatureSSE1, FeatureFXSR, FeatureCMOV], diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a731541ca7778e..9d651d4db67311 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -530,7 +530,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRL_PARTS, VT, Custom); } - if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow()) + if (Subtarget.hasSSEPrefetch()) setOperationAction(ISD::PREFETCH , MVT::Other, Custom); setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); @@ -5204,29 +5204,10 @@ static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, // Split the demanded elts of a HADD/HSUB node between its operands. static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS) { - int NumLanes = VT.getSizeInBits() / 128; - int NumElts = DemandedElts.getBitWidth(); - int NumEltsPerLane = NumElts / NumLanes; - int HalfEltsPerLane = NumEltsPerLane / 2; - - DemandedLHS = APInt::getZero(NumElts); - DemandedRHS = APInt::getZero(NumElts); - - // Map DemandedElts to the horizontal operands. - for (int Idx = 0; Idx != NumElts; ++Idx) { - if (!DemandedElts[Idx]) - continue; - int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane; - int LocalIdx = Idx % NumEltsPerLane; - if (LocalIdx < HalfEltsPerLane) { - DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0); - DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1); - } else { - LocalIdx -= HalfEltsPerLane; - DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0); - DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1); - } - } + getHorizDemandedEltsForFirstOperand(VT.getSizeInBits(), DemandedElts, + DemandedLHS, DemandedRHS); + DemandedLHS |= DemandedLHS << 1; + DemandedRHS |= DemandedRHS << 1; } /// Calculates the shuffle mask corresponding to the target-specific opcode. @@ -37174,6 +37155,32 @@ static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, Known = KnownBits::sadd_sat(Lo, Hi); } +static KnownBits computeKnownBitsForHorizontalOperation( + const SDValue Op, const APInt &DemandedElts, unsigned Depth, + const SelectionDAG &DAG, + const function_ref + KnownBitsFunc) { + APInt DemandedEltsLHS, DemandedEltsRHS; + getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(), + DemandedElts, DemandedEltsLHS, + DemandedEltsRHS); + + const auto ComputeForSingleOpFunc = + [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) { + return KnownBitsFunc( + DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1), + DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1)); + }; + + if (DemandedEltsRHS.isZero()) + return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS); + if (DemandedEltsLHS.isZero()) + return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS); + + return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS) + .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS)); +} + void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, @@ -37503,6 +37510,17 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, } break; } + case X86ISD::HADD: + case X86ISD::HSUB: { + Known = computeKnownBitsForHorizontalOperation( + Op, DemandedElts, Depth, DAG, + [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) { + return KnownBits::computeForAddSub( + /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false, + KnownLHS, KnownRHS); + }); + break; + } case ISD::INTRINSIC_WO_CHAIN: { switch (Op->getConstantOperandVal(0)) { case Intrinsic::x86_sse2_pmadd_wd: @@ -45385,7 +45403,7 @@ static SDValue combineToExtendBoolVectorInReg( /// select to a bitwise logic operation. /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()? static SDValue -combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, +combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue Cond = N->getOperand(0); @@ -45393,7 +45411,6 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, SDValue RHS = N->getOperand(2); EVT VT = LHS.getValueType(); EVT CondVT = Cond.getValueType(); - SDLoc DL(N); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (N->getOpcode() != ISD::VSELECT) @@ -45491,7 +45508,7 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, /// and concatenate the result to eliminate a wide (256-bit) vector instruction: /// vselect Cond, (concat T0, T1), (concat F0, F1) --> /// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1) -static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, +static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT) @@ -45515,15 +45532,15 @@ static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, ArrayRef Ops) { return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops); }; - return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal }, - makeBlend, /*CheckBWI*/ false); + return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend, + /*CheckBWI*/ false); } -static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { +static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, + const SDLoc &DL) { SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); - SDLoc DL(N); auto *TrueC = dyn_cast(LHS); auto *FalseC = dyn_cast(RHS); @@ -45597,6 +45614,7 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { /// This function will also call SimplifyDemandedBits on already created /// BLENDV to perform additional simplifications. static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, + const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue Cond = N->getOperand(0); @@ -45681,8 +45699,8 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, // Otherwise we can still at least try to simplify multiple use bits. if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG)) - return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V, - N->getOperand(1), N->getOperand(2)); + return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V, + N->getOperand(1), N->getOperand(2)); return SDValue(); } @@ -45749,14 +45767,13 @@ static SDValue combineLogicBlendIntoConditionalNegate( return DAG.getBitcast(VT, Res); } -static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, + const X86Subtarget &Subtarget) { if (!Subtarget.hasAVX512()) return SDValue(); if (N->getOpcode() != ISD::VSELECT) return SDValue(); - SDLoc DL(N); SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); @@ -45798,7 +45815,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // folded with mask instruction, while the rhs operand can't. Commute the // lhs and rhs of the select instruction to create the opportunity of // folding. - if (SDValue V = commuteSelect(N, DAG, Subtarget)) + if (SDValue V = commuteSelect(N, DAG, DL, Subtarget)) return V; EVT VT = LHS.getValueType(); @@ -46080,7 +46097,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, } } - if (SDValue V = combineSelectOfTwoConstants(N, DAG)) + if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL)) return V; if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && @@ -46190,35 +46207,45 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // to bitwidth-1 for unsigned shifts, effectively performing a maximum left // shift of bitwidth-1 positions. and returns zero for unsigned right shifts // exceeding bitwidth-1. - if (N->getOpcode() == ISD::VSELECT && - (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) && - supportedVectorVarShift(VT, Subtarget, LHS.getOpcode())) { - APInt SV; + if (N->getOpcode() == ISD::VSELECT) { + using namespace llvm::SDPatternMatch; // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt) // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt) - if (Cond.getOpcode() == ISD::SETCC && - Cond.getOperand(0) == LHS.getOperand(1) && - cast(Cond.getOperand(2))->get() == ISD::SETULT && - ISD::isConstantSplatVector(Cond.getOperand(1).getNode(), SV) && + if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) && + supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) && ISD::isConstantSplatVectorAllZeros(RHS.getNode()) && - SV == VT.getScalarSizeInBits()) { + sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)), + m_SpecificInt(VT.getScalarSizeInBits()), + m_SpecificCondCode(ISD::SETULT)))) { return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV : X86ISD::VSHLV, DL, VT, LHS.getOperand(0), LHS.getOperand(1)); } + // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt) + // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt) + if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) && + supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) && + ISD::isConstantSplatVectorAllZeros(LHS.getNode()) && + sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)), + m_SpecificInt(VT.getScalarSizeInBits()), + m_SpecificCondCode(ISD::SETUGE)))) { + return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV + : X86ISD::VSHLV, + DL, VT, RHS.getOperand(0), RHS.getOperand(1)); + } } // Early exit check if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget)) return SDValue(); - if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget)) + if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DL, DCI, Subtarget)) return V; - if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget)) + if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget)) return V; - if (SDValue V = narrowVectorSelect(N, DAG, Subtarget)) + if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget)) return V; // select(~Cond, X, Y) -> select(Cond, Y, X) @@ -48020,10 +48047,12 @@ static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + using namespace llvm::SDPatternMatch; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N1C = dyn_cast(N1); EVT VT = N0.getValueType(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); SDLoc DL(N); // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts @@ -48033,21 +48062,16 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, SDValue Cond = N0.getOperand(0); SDValue N00 = N0.getOperand(1); SDValue N01 = N0.getOperand(2); - APInt SV; // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt) - if (Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == N1 && - cast(Cond.getOperand(2))->get() == ISD::SETULT && - ISD::isConstantSplatVector(Cond.getOperand(1).getNode(), SV) && - ISD::isConstantSplatVectorAllZeros(N01.getNode()) && - SV == VT.getScalarSizeInBits()) { + if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) && + sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits), + m_SpecificCondCode(ISD::SETULT)))) { return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1); } // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt) - if (Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == N1 && - cast(Cond.getOperand(2))->get() == ISD::SETUGE && - ISD::isConstantSplatVector(Cond.getOperand(1).getNode(), SV) && - ISD::isConstantSplatVectorAllZeros(N00.getNode()) && - SV == VT.getScalarSizeInBits()) { + if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) && + sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits), + m_SpecificCondCode(ISD::SETUGE)))) { return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1); } } @@ -48160,9 +48184,11 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + using namespace llvm::SDPatternMatch; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); SDLoc DL(N); if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget)) @@ -48175,21 +48201,16 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, SDValue Cond = N0.getOperand(0); SDValue N00 = N0.getOperand(1); SDValue N01 = N0.getOperand(2); - APInt SV; // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt) - if (Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == N1 && - cast(Cond.getOperand(2))->get() == ISD::SETULT && - ISD::isConstantSplatVector(Cond.getOperand(1).getNode(), SV) && - ISD::isConstantSplatVectorAllZeros(N01.getNode()) && - SV == VT.getScalarSizeInBits()) { + if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) && + sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits), + m_SpecificCondCode(ISD::SETULT)))) { return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1); } // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt) - if (Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == N1 && - cast(Cond.getOperand(2))->get() == ISD::SETUGE && - ISD::isConstantSplatVector(Cond.getOperand(1).getNode(), SV) && - ISD::isConstantSplatVectorAllZeros(N00.getNode()) && - SV == VT.getScalarSizeInBits()) { + if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) && + sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits), + m_SpecificCondCode(ISD::SETUGE)))) { return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1); } } diff --git a/llvm/lib/Target/X86/X86Instr3DNow.td b/llvm/lib/Target/X86/X86Instr3DNow.td index 03612de0fad942..13fe7d2ccbe77a 100644 --- a/llvm/lib/Target/X86/X86Instr3DNow.td +++ b/llvm/lib/Target/X86/X86Instr3DNow.td @@ -12,7 +12,7 @@ //===----------------------------------------------------------------------===// class I3DNow o, Format F, dag outs, dag ins, string asm, list pat> - : I, Requires<[Has3DNow]> { + : I { } class I3DNow_binop o, Format F, dag ins, string Mnemonic, list pat> @@ -25,66 +25,60 @@ class I3DNow_conv o, Format F, dag ins, string Mnemonic, list pat> : I3DNow, ThreeDNow; -multiclass I3DNow_binop_rm_int opc, string Mn, - X86FoldableSchedWrite sched, bit Commutable = 0, - string Ver = ""> { - let isCommutable = Commutable in - def rr : I3DNow_binop( - !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>, - Sched<[sched]>; - def rm : I3DNow_binop( - !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, - (bitconvert (load_mmx addr:$src2))))]>, - Sched<[sched.Folded, sched.ReadAfterFold]>; +multiclass I3DNow_binop_rm opc, string Mn, + X86FoldableSchedWrite sched, bit Commutable = 0> { + let mayStore=0, hasSideEffects=0 in { + let isCommutable = Commutable, mayLoad=0 in + def rr : I3DNow_binop, Sched<[sched]>; + let mayLoad=1 in + def rm : I3DNow_binop, Sched<[sched.Folded, sched.ReadAfterFold]>; + } } -multiclass I3DNow_conv_rm_int opc, string Mn, - X86FoldableSchedWrite sched, string Ver = ""> { - def rr : I3DNow_conv( - !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>, - Sched<[sched]>; - def rm : I3DNow_conv( - !strconcat("int_x86_3dnow", Ver, "_", Mn)) - (bitconvert (load_mmx addr:$src))))]>, - Sched<[sched.Folded, sched.ReadAfterFold]>; +multiclass I3DNow_conv_rm opc, string Mn, + X86FoldableSchedWrite sched> { + let mayStore=0, hasSideEffects=0 in { + let mayLoad=0 in + def rr : I3DNow_conv, Sched<[sched]>; + let mayLoad=1 in + def rm : I3DNow_conv, Sched<[sched.Folded, sched.ReadAfterFold]>; + } } -defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb", SchedWriteVecALU.MMX, 1>; -defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id", WriteCvtPS2I>; -defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc", WriteFAdd>; -defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd", WriteFAdd, 1>; -defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq", WriteFAdd, 1>; -defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge", WriteFAdd>; -defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt", WriteFAdd>; -defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax", WriteFAdd>; -defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin", WriteFAdd>; -defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul", WriteFAdd, 1>; -defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp", WriteFAdd>; -defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1", WriteFAdd>; -defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2", WriteFAdd>; -defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1", WriteFAdd>; -defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt", WriteFAdd>; -defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub", WriteFAdd, 1>; -defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr", WriteFAdd, 1>; -defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd", WriteCvtI2PS>; -defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", SchedWriteVecIMul.MMX, 1>; +defm PAVGUSB : I3DNow_binop_rm<0xBF, "pavgusb", SchedWriteVecALU.MMX, 1>; +defm PF2ID : I3DNow_conv_rm<0x1D, "pf2id", WriteCvtPS2I>; +defm PFACC : I3DNow_binop_rm<0xAE, "pfacc", WriteFAdd>; +defm PFADD : I3DNow_binop_rm<0x9E, "pfadd", WriteFAdd, 1>; +defm PFCMPEQ : I3DNow_binop_rm<0xB0, "pfcmpeq", WriteFAdd, 1>; +defm PFCMPGE : I3DNow_binop_rm<0x90, "pfcmpge", WriteFAdd>; +defm PFCMPGT : I3DNow_binop_rm<0xA0, "pfcmpgt", WriteFAdd>; +defm PFMAX : I3DNow_binop_rm<0xA4, "pfmax", WriteFAdd>; +defm PFMIN : I3DNow_binop_rm<0x94, "pfmin", WriteFAdd>; +defm PFMUL : I3DNow_binop_rm<0xB4, "pfmul", WriteFAdd, 1>; +defm PFRCP : I3DNow_conv_rm<0x96, "pfrcp", WriteFAdd>; +defm PFRCPIT1 : I3DNow_binop_rm<0xA6, "pfrcpit1", WriteFAdd>; +defm PFRCPIT2 : I3DNow_binop_rm<0xB6, "pfrcpit2", WriteFAdd>; +defm PFRSQIT1 : I3DNow_binop_rm<0xA7, "pfrsqit1", WriteFAdd>; +defm PFRSQRT : I3DNow_conv_rm<0x97, "pfrsqrt", WriteFAdd>; +defm PFSUB : I3DNow_binop_rm<0x9A, "pfsub", WriteFAdd, 1>; +defm PFSUBR : I3DNow_binop_rm<0xAA, "pfsubr", WriteFAdd, 1>; +defm PI2FD : I3DNow_conv_rm<0x0D, "pi2fd", WriteCvtI2PS>; +defm PMULHRW : I3DNow_binop_rm<0xB7, "pmulhrw", SchedWriteVecIMul.MMX, 1>; -let SchedRW = [WriteEMMS], - Defs = [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, - ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7] in +let SchedRW = [WriteEMMS], mayLoad=1, mayStore=1, hasSideEffects=1 in def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms", - [(int_x86_mmx_femms)]>, TB; + []>, TB; -let SchedRW = [WriteLoad] in { -let Predicates = [Has3DNow, NoSSEPrefetch] in +let SchedRW = [WriteLoad], mayLoad=1, mayStore=1, hasSideEffects=0 in { def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr), "prefetch\t$addr", - [(prefetch addr:$addr, timm, timm, (i32 1))]>, TB; + []>, TB; +// Note: PREFETCHW is the only instruction in this file which is NOT specific to 3DNow! def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr", [(prefetch addr:$addr, (i32 1), (i32 PrefetchWLevel), (i32 1))]>, TB, Requires<[HasPrefetchW]>; @@ -94,8 +88,8 @@ def PREFETCHWT1 : I<0x0D, MRM2m, (outs), (ins i8mem:$addr), "prefetchwt1\t$addr" } // "3DNowA" instructions -defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", WriteCvtPS2I, "a">; -defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", WriteCvtI2PS, "a">; -defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", WriteFAdd, 0, "a">; -defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", WriteFAdd, 0, "a">; -defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", SchedWriteShuffle.MMX, "a">; +defm PF2IW : I3DNow_conv_rm<0x1C, "pf2iw", WriteCvtPS2I>; +defm PI2FW : I3DNow_conv_rm<0x0C, "pi2fw", WriteCvtI2PS>; +defm PFNACC : I3DNow_binop_rm<0x8A, "pfnacc", WriteFAdd, 0>; +defm PFPNACC : I3DNow_binop_rm<0x8E, "pfpnacc", WriteFAdd, 0>; +defm PSWAPD : I3DNow_conv_rm<0xBB, "pswapd", SchedWriteShuffle.MMX>; diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index 419ff9e6f5c0fb..f6038cf7a94cbd 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -50,8 +50,6 @@ def HasCMOV : Predicate<"Subtarget->canUseCMOV()">; def NoCMOV : Predicate<"!Subtarget->canUseCMOV()">; def HasNOPL : Predicate<"Subtarget->hasNOPL()">; def HasMMX : Predicate<"Subtarget->hasMMX()">; -def Has3DNow : Predicate<"Subtarget->hasThreeDNow()">; -def Has3DNowA : Predicate<"Subtarget->hasThreeDNowA()">; def HasSSE1 : Predicate<"Subtarget->hasSSE1()">; def UseSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">; def HasSSE2 : Predicate<"Subtarget->hasSSE2()">; @@ -141,7 +139,6 @@ def HasSGX : Predicate<"Subtarget->hasSGX()">; def HasSM3 : Predicate<"Subtarget->hasSM3()">; def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; def HasSSEPrefetch : Predicate<"Subtarget->hasSSEPrefetch()">; -def NoSSEPrefetch : Predicate<"!Subtarget->hasSSEPrefetch()">; def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; def HasPREFETCHI : Predicate<"Subtarget->hasPREFETCHI()">; def HasPrefetchW : Predicate<"Subtarget->hasPrefetchW()">; diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index 6c9a94c9495908..4e8e04b1112c0c 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -290,8 +290,7 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, IsUnalignedMem16Slow = false; LLVM_DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel - << ", 3DNowLevel " << X863DNowLevel << ", 64bit " - << HasX86_64 << "\n"); + << ", MMX " << HasMMX << ", 64bit " << HasX86_64 << "\n"); if (Is64Bit && !HasX86_64) report_fatal_error("64-bit code requested on a subtarget that doesn't " "support it!"); diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 4532db134fcb42..e3cb9ee8ce1909 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -55,10 +55,6 @@ class X86Subtarget final : public X86GenSubtargetInfo { NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512 }; - enum X863DNowEnum { - NoThreeDNow, MMX, ThreeDNow, ThreeDNowA - }; - /// Which PIC style to use PICStyles::Style PICStyle; @@ -67,9 +63,6 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported. X86SSEEnum X86SSELevel = NoSSE; - /// MMX, 3DNow, 3DNow Athlon, or none supported. - X863DNowEnum X863DNowLevel = NoThreeDNow; - #define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ bool ATTRIBUTE = DEFAULT; #include "X86GenSubtargetInfo.inc" @@ -207,21 +200,16 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool hasAVX2() const { return X86SSELevel >= AVX2; } bool hasAVX512() const { return X86SSELevel >= AVX512; } bool hasInt256() const { return hasAVX2(); } - bool hasMMX() const { return X863DNowLevel >= MMX; } - bool hasThreeDNow() const { return X863DNowLevel >= ThreeDNow; } - bool hasThreeDNowA() const { return X863DNowLevel >= ThreeDNowA; } bool hasAnyFMA() const { return hasFMA() || hasFMA4(); } bool hasPrefetchW() const { // The PREFETCHW instruction was added with 3DNow but later CPUs gave it - // its own CPUID bit as part of deprecating 3DNow. We assume the - // L1 version exists if the L2 version does. - return hasThreeDNow() || hasPRFCHW(); + // its own CPUID bit as part of deprecating 3DNow. + return hasPRFCHW(); } bool hasSSEPrefetch() const { - // We implicitly enable these when we have a write prefix supporting cache - // level OR if we have prfchw, but don't already have a read prefetch from - // 3dnow. - return hasSSE1() || (hasPRFCHW() && !hasThreeDNow()) || hasPREFETCHI(); + // We also implicitly enable these when we have a write prefix supporting + // cache level OR if we have prfchw. + return hasSSE1() || hasPRFCHW() || hasPREFETCHI(); } bool canUseLAHFSAHF() const { return hasLAHFSAHF64() || !is64Bit(); } // These are generic getters that OR together all of the thunk types diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp index 27e198508c7921..491defb8676437 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp @@ -169,7 +169,7 @@ void XtensaInstrInfo::loadImmediate(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, DL, get(Xtensa::MOVI), *Reg).addImm(Low); BuildMI(MBB, MBBI, DL, get(Xtensa::ADDMI), *Reg).addReg(*Reg).addImm(High); } else if (Value >= -4294967296LL && Value <= 4294967295LL) { - // 32 bit arbirary constant + // 32 bit arbitrary constant MachineConstantPool *MCP = MBB.getParent()->getConstantPool(); uint64_t UVal = ((uint64_t)Value) & 0xFFFFFFFFLL; const Constant *CVal = ConstantInt::get( diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 8d5ad91839bc40..82c1731f58f0ae 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -452,7 +452,7 @@ StringRef sys::detail::getHostCPUNameForRISCV(StringRef ProcCpuinfoContent) { return StringSwitch(UArch) .Case("sifive,u74-mc", "sifive-u74") .Case("sifive,bullet0", "sifive-u74") - .Default("generic"); + .Default(""); } StringRef sys::detail::getHostCPUNameForBPF() { @@ -1573,8 +1573,10 @@ StringRef sys::getHostCPUName() { #if defined(__linux__) std::unique_ptr P = getProcCpuinfoContent(); StringRef Content = P ? P->getBuffer() : ""; - return detail::getHostCPUNameForRISCV(Content); -#else + StringRef Name = detail::getHostCPUNameForRISCV(Content); + if (!Name.empty()) + return Name; +#endif #if __riscv_xlen == 64 return "generic-rv64"; #elif __riscv_xlen == 32 @@ -1582,7 +1584,6 @@ StringRef sys::getHostCPUName() { #else #error "Unhandled value of __riscv_xlen" #endif -#endif } #elif defined(__sparc__) #if defined(__linux__) @@ -2006,6 +2007,76 @@ const StringMap sys::getHostCPUFeatures() { return Features; } +#elif defined(__linux__) && defined(__riscv) +// struct riscv_hwprobe +struct RISCVHwProbe { + int64_t Key; + uint64_t Value; +}; +const StringMap sys::getHostCPUFeatures() { + RISCVHwProbe Query[]{{/*RISCV_HWPROBE_KEY_BASE_BEHAVIOR=*/3, 0}, + {/*RISCV_HWPROBE_KEY_IMA_EXT_0=*/4, 0}}; + int Ret = syscall(/*__NR_riscv_hwprobe=*/258, /*pairs=*/Query, + /*pair_count=*/std::size(Query), /*cpu_count=*/0, + /*cpus=*/0, /*flags=*/0); + if (Ret != 0) + return {}; + + StringMap Features; + uint64_t BaseMask = Query[0].Value; + // Check whether RISCV_HWPROBE_BASE_BEHAVIOR_IMA is set. + if (BaseMask & 1) { + Features["i"] = true; + Features["m"] = true; + Features["a"] = true; + } + + uint64_t ExtMask = Query[1].Value; + Features["f"] = ExtMask & (1 << 0); // RISCV_HWPROBE_IMA_FD + Features["d"] = ExtMask & (1 << 0); // RISCV_HWPROBE_IMA_FD + Features["c"] = ExtMask & (1 << 1); // RISCV_HWPROBE_IMA_C + Features["v"] = ExtMask & (1 << 2); // RISCV_HWPROBE_IMA_V + Features["zba"] = ExtMask & (1 << 3); // RISCV_HWPROBE_EXT_ZBA + Features["zbb"] = ExtMask & (1 << 4); // RISCV_HWPROBE_EXT_ZBB + Features["zbs"] = ExtMask & (1 << 5); // RISCV_HWPROBE_EXT_ZBS + Features["zicboz"] = ExtMask & (1 << 6); // RISCV_HWPROBE_EXT_ZICBOZ + Features["zbc"] = ExtMask & (1 << 7); // RISCV_HWPROBE_EXT_ZBC + Features["zbkb"] = ExtMask & (1 << 8); // RISCV_HWPROBE_EXT_ZBKB + Features["zbkc"] = ExtMask & (1 << 9); // RISCV_HWPROBE_EXT_ZBKC + Features["zbkx"] = ExtMask & (1 << 10); // RISCV_HWPROBE_EXT_ZBKX + Features["zknd"] = ExtMask & (1 << 11); // RISCV_HWPROBE_EXT_ZKND + Features["zkne"] = ExtMask & (1 << 12); // RISCV_HWPROBE_EXT_ZKNE + Features["zknh"] = ExtMask & (1 << 13); // RISCV_HWPROBE_EXT_ZKNH + Features["zksed"] = ExtMask & (1 << 14); // RISCV_HWPROBE_EXT_ZKSED + Features["zksh"] = ExtMask & (1 << 15); // RISCV_HWPROBE_EXT_ZKSH + Features["zkt"] = ExtMask & (1 << 16); // RISCV_HWPROBE_EXT_ZKT + Features["zvbb"] = ExtMask & (1 << 17); // RISCV_HWPROBE_EXT_ZVBB + Features["zvbc"] = ExtMask & (1 << 18); // RISCV_HWPROBE_EXT_ZVBC + Features["zvkb"] = ExtMask & (1 << 19); // RISCV_HWPROBE_EXT_ZVKB + Features["zvkg"] = ExtMask & (1 << 20); // RISCV_HWPROBE_EXT_ZVKG + Features["zvkned"] = ExtMask & (1 << 21); // RISCV_HWPROBE_EXT_ZVKNED + Features["zvknha"] = ExtMask & (1 << 22); // RISCV_HWPROBE_EXT_ZVKNHA + Features["zvknhb"] = ExtMask & (1 << 23); // RISCV_HWPROBE_EXT_ZVKNHB + Features["zvksed"] = ExtMask & (1 << 24); // RISCV_HWPROBE_EXT_ZVKSED + Features["zvksh"] = ExtMask & (1 << 25); // RISCV_HWPROBE_EXT_ZVKSH + Features["zvkt"] = ExtMask & (1 << 26); // RISCV_HWPROBE_EXT_ZVKT + Features["zfh"] = ExtMask & (1 << 27); // RISCV_HWPROBE_EXT_ZFH + Features["zfhmin"] = ExtMask & (1 << 28); // RISCV_HWPROBE_EXT_ZFHMIN + Features["zihintntl"] = ExtMask & (1 << 29); // RISCV_HWPROBE_EXT_ZIHINTNTL + Features["zvfh"] = ExtMask & (1 << 30); // RISCV_HWPROBE_EXT_ZVFH + Features["zvfhmin"] = ExtMask & (1ULL << 31); // RISCV_HWPROBE_EXT_ZVFHMIN + Features["zfa"] = ExtMask & (1ULL << 32); // RISCV_HWPROBE_EXT_ZFA + Features["ztso"] = ExtMask & (1ULL << 33); // RISCV_HWPROBE_EXT_ZTSO + Features["zacas"] = ExtMask & (1ULL << 34); // RISCV_HWPROBE_EXT_ZACAS + Features["zicond"] = ExtMask & (1ULL << 35); // RISCV_HWPROBE_EXT_ZICOND + Features["zihintpause"] = + ExtMask & (1ULL << 36); // RISCV_HWPROBE_EXT_ZIHINTPAUSE + + // TODO: set unaligned-scalar-mem if RISCV_HWPROBE_KEY_MISALIGNED_PERF returns + // RISCV_HWPROBE_MISALIGNED_FAST. + + return Features; +} #else const StringMap sys::getHostCPUFeatures() { return {}; } #endif diff --git a/llvm/lib/TargetParser/RISCVTargetParser.cpp b/llvm/lib/TargetParser/RISCVTargetParser.cpp index 9003f9beffa7e7..db1b5f689d7daf 100644 --- a/llvm/lib/TargetParser/RISCVTargetParser.cpp +++ b/llvm/lib/TargetParser/RISCVTargetParser.cpp @@ -21,7 +21,9 @@ namespace llvm { namespace RISCV { enum CPUKind : unsigned { -#define PROC(ENUM, NAME, DEFAULT_MARCH, FAST_UNALIGN) CK_##ENUM, +#define PROC(ENUM, NAME, DEFAULT_MARCH, FAST_SCALAR_UNALIGN, \ + FAST_VECTOR_UNALIGN) \ + CK_##ENUM, #define TUNE_PROC(ENUM, NAME) CK_##ENUM, #include "llvm/TargetParser/RISCVTargetParserDef.inc" }; @@ -29,13 +31,15 @@ enum CPUKind : unsigned { struct CPUInfo { StringLiteral Name; StringLiteral DefaultMarch; - bool FastUnalignedAccess; + bool FastScalarUnalignedAccess; + bool FastVectorUnalignedAccess; bool is64Bit() const { return DefaultMarch.starts_with("rv64"); } }; constexpr CPUInfo RISCVCPUInfo[] = { -#define PROC(ENUM, NAME, DEFAULT_MARCH, FAST_UNALIGN) \ - {NAME, DEFAULT_MARCH, FAST_UNALIGN}, +#define PROC(ENUM, NAME, DEFAULT_MARCH, FAST_SCALAR_UNALIGN, \ + FAST_VECTOR_UNALIGN) \ + {NAME, DEFAULT_MARCH, FAST_SCALAR_UNALIGN, FAST_VECTOR_UNALIGN}, #include "llvm/TargetParser/RISCVTargetParserDef.inc" }; @@ -46,9 +50,14 @@ static const CPUInfo *getCPUInfoByName(StringRef CPU) { return nullptr; } -bool hasFastUnalignedAccess(StringRef CPU) { +bool hasFastScalarUnalignedAccess(StringRef CPU) { const CPUInfo *Info = getCPUInfoByName(CPU); - return Info && Info->FastUnalignedAccess; + return Info && Info->FastScalarUnalignedAccess; +} + +bool hasFastVectorUnalignedAccess(StringRef CPU) { + const CPUInfo *Info = getCPUInfoByName(CPU); + return Info && Info->FastVectorUnalignedAccess; } bool parseCPU(StringRef CPU, bool IsRV64) { diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index a54a02ac61d681..4fc1ff5aaa051f 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -44,6 +44,8 @@ StringRef Triple::getArchTypeName(ArchType Kind) { case hsail: return "hsail"; case kalimba: return "kalimba"; case lanai: return "lanai"; + case le32: return "le32"; + case le64: return "le64"; case loongarch32: return "loongarch32"; case loongarch64: return "loongarch64"; case m68k: return "m68k"; @@ -197,6 +199,9 @@ StringRef Triple::getArchTypePrefix(ArchType Kind) { case nvptx: return "nvvm"; case nvptx64: return "nvvm"; + case le32: return "le32"; + case le64: return "le64"; + case amdil: case amdil64: return "amdil"; @@ -427,6 +432,8 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { .Case("xcore", xcore) .Case("nvptx", nvptx) .Case("nvptx64", nvptx64) + .Case("le32", le32) + .Case("le64", le64) .Case("amdil", amdil) .Case("amdil64", amdil64) .Case("hsail", hsail) @@ -567,6 +574,8 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Case("xcore", Triple::xcore) .Case("nvptx", Triple::nvptx) .Case("nvptx64", Triple::nvptx64) + .Case("le32", Triple::le32) + .Case("le64", Triple::le64) .Case("amdil", Triple::amdil) .Case("amdil64", Triple::amdil64) .Case("hsail", Triple::hsail) @@ -896,6 +905,8 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { case Triple::hsail: case Triple::kalimba: case Triple::lanai: + case Triple::le32: + case Triple::le64: case Triple::loongarch32: case Triple::loongarch64: case Triple::m68k: @@ -1592,6 +1603,7 @@ unsigned Triple::getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::hsail: case llvm::Triple::kalimba: case llvm::Triple::lanai: + case llvm::Triple::le32: case llvm::Triple::loongarch32: case llvm::Triple::m68k: case llvm::Triple::mips: @@ -1624,6 +1636,7 @@ unsigned Triple::getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::bpfeb: case llvm::Triple::bpfel: case llvm::Triple::hsail64: + case llvm::Triple::le64: case llvm::Triple::loongarch64: case llvm::Triple::mips64: case llvm::Triple::mips64el: @@ -1682,6 +1695,7 @@ Triple Triple::get32BitArchVariant() const { case Triple::hsail: case Triple::kalimba: case Triple::lanai: + case Triple::le32: case Triple::loongarch32: case Triple::m68k: case Triple::mips: @@ -1712,6 +1726,7 @@ Triple Triple::get32BitArchVariant() const { case Triple::aarch64_be: T.setArch(Triple::armeb); break; case Triple::amdil64: T.setArch(Triple::amdil); break; case Triple::hsail64: T.setArch(Triple::hsail); break; + case Triple::le64: T.setArch(Triple::le32); break; case Triple::loongarch64: T.setArch(Triple::loongarch32); break; case Triple::mips64: T.setArch(Triple::mips, getSubArch()); @@ -1766,6 +1781,7 @@ Triple Triple::get64BitArchVariant() const { case Triple::bpfeb: case Triple::bpfel: case Triple::hsail64: + case Triple::le64: case Triple::loongarch64: case Triple::mips64: case Triple::mips64el: @@ -1789,6 +1805,7 @@ Triple Triple::get64BitArchVariant() const { case Triple::arm: T.setArch(Triple::aarch64); break; case Triple::armeb: T.setArch(Triple::aarch64_be); break; case Triple::hsail: T.setArch(Triple::hsail64); break; + case Triple::le32: T.setArch(Triple::le64); break; case Triple::loongarch32: T.setArch(Triple::loongarch64); break; case Triple::mips: T.setArch(Triple::mips64, getSubArch()); @@ -1831,6 +1848,8 @@ Triple Triple::getBigEndianArchVariant() const { case Triple::hsail64: case Triple::hsail: case Triple::kalimba: + case Triple::le32: + case Triple::le64: case Triple::loongarch32: case Triple::loongarch64: case Triple::msp430: @@ -1934,6 +1953,8 @@ bool Triple::isLittleEndian() const { case Triple::hsail64: case Triple::hsail: case Triple::kalimba: + case Triple::le32: + case Triple::le64: case Triple::loongarch32: case Triple::loongarch64: case Triple::mips64el: diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 1e0b8d448b9d14..d5a38ec17a2a84 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -54,6 +54,11 @@ static cl::opt StrNCmpInlineThreshold( cl::desc("The maximum length of a constant string for a builtin string cmp " "call eligible for inlining. The default value is 3.")); +static cl::opt + MemChrInlineThreshold("memchr-inline-threshold", cl::init(3), cl::Hidden, + cl::desc("The maximum length of a constant string to " + "inline a memchr call.")); + /// Match a pattern for a bitwise funnel/rotate operation that partially guards /// against undefined behavior by branching around the funnel-shift/rotation /// when the shift amount is 0. @@ -1103,6 +1108,81 @@ void StrNCmpInliner::inlineCompare(Value *LHS, StringRef RHS, uint64_t N, } } +/// Convert memchr with a small constant string into a switch +static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU, + const DataLayout &DL) { + if (isa(Call->getArgOperand(1))) + return false; + + StringRef Str; + Value *Base = Call->getArgOperand(0); + if (!getConstantStringInfo(Base, Str, /*TrimAtNul=*/false)) + return false; + + uint64_t N = Str.size(); + if (auto *ConstInt = dyn_cast(Call->getArgOperand(2))) { + uint64_t Val = ConstInt->getZExtValue(); + // Ignore the case that n is larger than the size of string. + if (Val > N) + return false; + N = Val; + } else + return false; + + if (N > MemChrInlineThreshold) + return false; + + BasicBlock *BB = Call->getParent(); + BasicBlock *BBNext = SplitBlock(BB, Call, DTU); + IRBuilder<> IRB(BB); + IntegerType *ByteTy = IRB.getInt8Ty(); + BB->getTerminator()->eraseFromParent(); + SwitchInst *SI = IRB.CreateSwitch( + IRB.CreateTrunc(Call->getArgOperand(1), ByteTy), BBNext, N); + Type *IndexTy = DL.getIndexType(Call->getType()); + SmallVector Updates; + + BasicBlock *BBSuccess = BasicBlock::Create( + Call->getContext(), "memchr.success", BB->getParent(), BBNext); + IRB.SetInsertPoint(BBSuccess); + PHINode *IndexPHI = IRB.CreatePHI(IndexTy, N, "memchr.idx"); + Value *FirstOccursLocation = IRB.CreateInBoundsPtrAdd(Base, IndexPHI); + IRB.CreateBr(BBNext); + if (DTU) + Updates.push_back({DominatorTree::Insert, BBSuccess, BBNext}); + + SmallPtrSet Cases; + for (uint64_t I = 0; I < N; ++I) { + ConstantInt *CaseVal = ConstantInt::get(ByteTy, Str[I]); + if (!Cases.insert(CaseVal).second) + continue; + + BasicBlock *BBCase = BasicBlock::Create(Call->getContext(), "memchr.case", + BB->getParent(), BBSuccess); + SI->addCase(CaseVal, BBCase); + IRB.SetInsertPoint(BBCase); + IndexPHI->addIncoming(ConstantInt::get(IndexTy, I), BBCase); + IRB.CreateBr(BBSuccess); + if (DTU) { + Updates.push_back({DominatorTree::Insert, BB, BBCase}); + Updates.push_back({DominatorTree::Insert, BBCase, BBSuccess}); + } + } + + PHINode *PHI = + PHINode::Create(Call->getType(), 2, Call->getName(), BBNext->begin()); + PHI->addIncoming(Constant::getNullValue(Call->getType()), BB); + PHI->addIncoming(FirstOccursLocation, BBSuccess); + + Call->replaceAllUsesWith(PHI); + Call->eraseFromParent(); + + if (DTU) + DTU->applyUpdates(Updates); + + return true; +} + static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI, TargetLibraryInfo &TLI, AssumptionCache &AC, DominatorTree &DT, const DataLayout &DL, @@ -1135,6 +1215,12 @@ static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI, return true; } break; + case LibFunc_memchr: + if (foldMemChr(CI, &DTU, DL)) { + MadeCFGChange = true; + return true; + } + break; default:; } return false; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 21d5e1dece0246..1661fa564c65c7 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -394,9 +394,14 @@ void PointerReplacer::replace(Instruction *I) { NewI->setNoWrapFlags(GEP->getNoWrapFlags()); WorkMap[GEP] = NewI; } else if (auto *SI = dyn_cast(I)) { - auto *NewSI = SelectInst::Create( - SI->getCondition(), getReplacement(SI->getTrueValue()), - getReplacement(SI->getFalseValue()), SI->getName(), nullptr, SI); + Value *TrueValue = SI->getTrueValue(); + Value *FalseValue = SI->getFalseValue(); + if (Value *Replacement = getReplacement(TrueValue)) + TrueValue = Replacement; + if (Value *Replacement = getReplacement(FalseValue)) + FalseValue = Replacement; + auto *NewSI = SelectInst::Create(SI->getCondition(), TrueValue, FalseValue, + SI->getName(), nullptr, SI); IC.InsertNewInstWith(NewSI, SI->getIterator()); NewSI->takeName(SI); WorkMap[SI] = NewSI; diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 635bd1236196e5..0ee1afa76a8234 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -231,12 +231,19 @@ class LoopIdiomRecognize { bool recognizePopcount(); void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, PHINode *CntPhi, Value *Var); + bool isProfitableToInsertFFS(Intrinsic::ID IntrinID, Value *InitX, + bool ZeroCheck, size_t CanonicalSize); + bool insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX, + Instruction *DefX, PHINode *CntPhi, + Instruction *CntInst); bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz + bool recognizeShiftUntilLessThan(); void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB, Instruction *CntInst, PHINode *CntPhi, Value *Var, Instruction *DefX, const DebugLoc &DL, bool ZeroCheck, - bool IsCntPhiUsedOutsideLoop); + bool IsCntPhiUsedOutsideLoop, + bool InsertSub = false); bool recognizeShiftUntilBitTest(); bool recognizeShiftUntilZero(); @@ -1482,7 +1489,8 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() { << CurLoop->getHeader()->getName() << "\n"); return recognizePopcount() || recognizeAndInsertFFS() || - recognizeShiftUntilBitTest() || recognizeShiftUntilZero(); + recognizeShiftUntilBitTest() || recognizeShiftUntilZero() || + recognizeShiftUntilLessThan(); } /// Check if the given conditional branch is based on the comparison between @@ -1517,6 +1525,34 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry, return nullptr; } +/// Check if the given conditional branch is based on an unsigned less-than +/// comparison between a variable and a constant, and if the comparison is false +/// the control yields to the loop entry. If the branch matches the behaviour, +/// the variable involved in the comparison is returned. +static Value *matchShiftULTCondition(BranchInst *BI, BasicBlock *LoopEntry, + APInt &Threshold) { + if (!BI || !BI->isConditional()) + return nullptr; + + ICmpInst *Cond = dyn_cast(BI->getCondition()); + if (!Cond) + return nullptr; + + ConstantInt *CmpConst = dyn_cast(Cond->getOperand(1)); + if (!CmpConst) + return nullptr; + + BasicBlock *FalseSucc = BI->getSuccessor(1); + ICmpInst::Predicate Pred = Cond->getPredicate(); + + if (Pred == ICmpInst::ICMP_ULT && FalseSucc == LoopEntry) { + Threshold = CmpConst->getValue(); + return Cond->getOperand(0); + } + + return nullptr; +} + // Check if the recurrence variable `VarX` is in the right form to create // the idiom. Returns the value coerced to a PHINode if so. static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX, @@ -1528,6 +1564,107 @@ static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX, return nullptr; } +/// Return true if the idiom is detected in the loop. +/// +/// Additionally: +/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ) +/// or nullptr if there is no such. +/// 2) \p CntPhi is set to the corresponding phi node +/// or nullptr if there is no such. +/// 3) \p InitX is set to the value whose CTLZ could be used. +/// 4) \p DefX is set to the instruction calculating Loop exit condition. +/// 5) \p Threshold is set to the constant involved in the unsigned less-than +/// comparison. +/// +/// The core idiom we are trying to detect is: +/// \code +/// if (x0 < 2) +/// goto loop-exit // the precondition of the loop +/// cnt0 = init-val +/// do { +/// x = phi (x0, x.next); //PhiX +/// cnt = phi (cnt0, cnt.next) +/// +/// cnt.next = cnt + 1; +/// ... +/// x.next = x >> 1; // DefX +/// } while (x >= 4) +/// loop-exit: +/// \endcode +static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL, + Intrinsic::ID &IntrinID, + Value *&InitX, Instruction *&CntInst, + PHINode *&CntPhi, Instruction *&DefX, + APInt &Threshold) { + BasicBlock *LoopEntry; + + DefX = nullptr; + CntInst = nullptr; + CntPhi = nullptr; + LoopEntry = *(CurLoop->block_begin()); + + // step 1: Check if the loop-back branch is in desirable form. + if (Value *T = matchShiftULTCondition( + dyn_cast(LoopEntry->getTerminator()), LoopEntry, + Threshold)) + DefX = dyn_cast(T); + else + return false; + + // step 2: Check the recurrence of variable X + if (!DefX || !isa(DefX)) + return false; + + PHINode *VarPhi = cast(DefX); + int Idx = VarPhi->getBasicBlockIndex(LoopEntry); + if (Idx == -1) + return false; + + DefX = dyn_cast(VarPhi->getIncomingValue(Idx)); + if (!DefX || DefX->getNumOperands() == 0 || DefX->getOperand(0) != VarPhi) + return false; + + // step 3: detect instructions corresponding to "x.next = x >> 1" + if (DefX->getOpcode() != Instruction::LShr) + return false; + + IntrinID = Intrinsic::ctlz; + ConstantInt *Shft = dyn_cast(DefX->getOperand(1)); + if (!Shft || !Shft->isOne()) + return false; + + InitX = VarPhi->getIncomingValueForBlock(CurLoop->getLoopPreheader()); + + // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1 + // or cnt.next = cnt + -1. + // TODO: We can skip the step. If loop trip count is known (CTLZ), + // then all uses of "cnt.next" could be optimized to the trip count + // plus "cnt0". Currently it is not optimized. + // This step could be used to detect POPCNT instruction: + // cnt.next = cnt + (x.next & 1) + for (Instruction &Inst : llvm::make_range( + LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) { + if (Inst.getOpcode() != Instruction::Add) + continue; + + ConstantInt *Inc = dyn_cast(Inst.getOperand(1)); + if (!Inc || (!Inc->isOne() && !Inc->isMinusOne())) + continue; + + PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry); + if (!Phi) + continue; + + CntInst = &Inst; + CntPhi = Phi; + break; + } + if (!CntInst) + return false; + + return true; +} + /// Return true iff the idiom is detected in the loop. /// /// Additionally: @@ -1756,27 +1893,35 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, return true; } -/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop -/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new -/// trip count returns true; otherwise, returns false. -bool LoopIdiomRecognize::recognizeAndInsertFFS() { - // Give up if the loop has multiple blocks or multiple backedges. - if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) - return false; +// Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always +// profitable if we delete the loop. +bool LoopIdiomRecognize::isProfitableToInsertFFS(Intrinsic::ID IntrinID, + Value *InitX, bool ZeroCheck, + size_t CanonicalSize) { + const Value *Args[] = {InitX, + ConstantInt::getBool(InitX->getContext(), ZeroCheck)}; - Intrinsic::ID IntrinID; - Value *InitX; - Instruction *DefX = nullptr; - PHINode *CntPhi = nullptr; - Instruction *CntInst = nullptr; - // Help decide if transformation is profitable. For ShiftUntilZero idiom, - // this is always 6. - size_t IdiomCanonicalSize = 6; + // @llvm.dbg doesn't count as they have no semantic effect. + auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug(); + uint32_t HeaderSize = + std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end()); - if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, - CntInst, CntPhi, DefX)) + IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args); + InstructionCost Cost = TTI->getIntrinsicInstrCost( + Attrs, TargetTransformInfo::TCK_SizeAndLatency); + if (HeaderSize != CanonicalSize && Cost > TargetTransformInfo::TCC_Basic) return false; + return true; +} + +/// Convert CTLZ / CTTZ idiom loop into countable loop. +/// If CTLZ / CTTZ inserted as a new trip count returns true; otherwise, +/// returns false. +bool LoopIdiomRecognize::insertFFSIfProfitable(Intrinsic::ID IntrinID, + Value *InitX, Instruction *DefX, + PHINode *CntPhi, + Instruction *CntInst) { bool IsCntPhiUsedOutsideLoop = false; for (User *U : CntPhi->users()) if (!CurLoop->contains(cast(U))) { @@ -1818,35 +1963,107 @@ bool LoopIdiomRecognize::recognizeAndInsertFFS() { ZeroCheck = true; } - // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always - // profitable if we delete the loop. - - // the loop has only 6 instructions: + // FFS idiom loop has only 6 instructions: // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ] // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ] // %shr = ashr %n.addr.0, 1 // %tobool = icmp eq %shr, 0 // %inc = add nsw %i.0, 1 // br i1 %tobool + size_t IdiomCanonicalSize = 6; + if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize)) + return false; - const Value *Args[] = {InitX, - ConstantInt::getBool(InitX->getContext(), ZeroCheck)}; + transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX, + DefX->getDebugLoc(), ZeroCheck, + IsCntPhiUsedOutsideLoop); + return true; +} - // @llvm.dbg doesn't count as they have no semantic effect. - auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug(); - uint32_t HeaderSize = - std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end()); +/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop +/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new +/// trip count returns true; otherwise, returns false. +bool LoopIdiomRecognize::recognizeAndInsertFFS() { + // Give up if the loop has multiple blocks or multiple backedges. + if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) + return false; - IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args); - InstructionCost Cost = - TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency); - if (HeaderSize != IdiomCanonicalSize && - Cost > TargetTransformInfo::TCC_Basic) + Intrinsic::ID IntrinID; + Value *InitX; + Instruction *DefX = nullptr; + PHINode *CntPhi = nullptr; + Instruction *CntInst = nullptr; + + if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, CntInst, CntPhi, + DefX)) + return false; + + return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst); +} + +bool LoopIdiomRecognize::recognizeShiftUntilLessThan() { + // Give up if the loop has multiple blocks or multiple backedges. + if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) + return false; + + Intrinsic::ID IntrinID; + Value *InitX; + Instruction *DefX = nullptr; + PHINode *CntPhi = nullptr; + Instruction *CntInst = nullptr; + + APInt LoopThreshold; + if (!detectShiftUntilLessThanIdiom(CurLoop, *DL, IntrinID, InitX, CntInst, + CntPhi, DefX, LoopThreshold)) + return false; + + if (LoopThreshold == 2) { + // Treat as regular FFS. + return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst); + } + + // Look for Floor Log2 Idiom. + if (LoopThreshold != 4) + return false; + + // Abort if CntPhi is used outside of the loop. + for (User *U : CntPhi->users()) + if (!CurLoop->contains(cast(U))) + return false; + + // It is safe to assume Preheader exist as it was checked in + // parent function RunOnLoop. + BasicBlock *PH = CurLoop->getLoopPreheader(); + auto *PreCondBB = PH->getSinglePredecessor(); + if (!PreCondBB) + return false; + auto *PreCondBI = dyn_cast(PreCondBB->getTerminator()); + if (!PreCondBI) + return false; + + APInt PreLoopThreshold; + if (matchShiftULTCondition(PreCondBI, PH, PreLoopThreshold) != InitX || + PreLoopThreshold != 2) return false; + bool ZeroCheck = true; + + // the loop has only 6 instructions: + // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ] + // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ] + // %shr = ashr %n.addr.0, 1 + // %tobool = icmp ult %n.addr.0, C + // %inc = add nsw %i.0, 1 + // br i1 %tobool + size_t IdiomCanonicalSize = 6; + if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize)) + return false; + + // log2(x) = w − 1 − clz(x) transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX, DefX->getDebugLoc(), ZeroCheck, - IsCntPhiUsedOutsideLoop); + /*IsCntPhiUsedOutsideLoop=*/false, + /*InsertSub=*/true); return true; } @@ -1961,7 +2178,7 @@ static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, void LoopIdiomRecognize::transformLoopToCountable( Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL, - bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) { + bool ZeroCheck, bool IsCntPhiUsedOutsideLoop, bool InsertSub) { BranchInst *PreheaderBr = cast(Preheader->getTerminator()); // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block @@ -1991,6 +2208,8 @@ void LoopIdiomRecognize::transformLoopToCountable( Type *CountTy = Count->getType(); Count = Builder.CreateSub( ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count); + if (InsertSub) + Count = Builder.CreateSub(Count, ConstantInt::get(CountTy, 1)); Value *NewCount = Count; if (IsCntPhiUsedOutsideLoop) Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1)); diff --git a/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/llvm/lib/Transforms/Scalar/LoopRotation.cpp index 072859af4c5f98..acb79e94d087c5 100644 --- a/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -64,10 +64,11 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM, // Vectorization requires loop-rotation. Use default threshold for loops the // user explicitly marked for vectorization, even when header duplication is // disabled. - int Threshold = EnableHeaderDuplication || - hasVectorizeTransformation(&L) == TM_ForcedByUser - ? DefaultRotationThreshold - : 0; + int Threshold = + (EnableHeaderDuplication && !L.getHeader()->getParent()->hasMinSize()) || + hasVectorizeTransformation(&L) == TM_ForcedByUser + ? DefaultRotationThreshold + : 0; const DataLayout &DL = L.getHeader()->getDataLayout(); const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL); diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 1c4a0d92dcde94..11f9f7822a15c8 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -2404,6 +2404,7 @@ void LSRInstance::OptimizeShadowIV() { /* Add new PHINode. */ PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator()); + NewPH->setDebugLoc(PH->getDebugLoc()); /* create new increment. '++d' in above example. */ Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue()); @@ -2411,6 +2412,7 @@ void LSRInstance::OptimizeShadowIV() { Incr->getOpcode() == Instruction::Add ? Instruction::FAdd : Instruction::FSub, NewPH, CFP, "IV.S.next.", Incr->getIterator()); + NewIncr->setDebugLoc(Incr->getDebugLoc()); NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry)); NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch)); @@ -5953,8 +5955,8 @@ void LSRInstance::RewriteForPHI( // formulae will not be implemented completely and some instructions // will not be eliminated. if (needUpdateFixups) { - for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) - for (LSRFixup &Fixup : Uses[LUIdx].Fixups) + for (LSRUse &LU : Uses) + for (LSRFixup &Fixup : LU.Fixups) // If fixup is supposed to rewrite some operand in the phi // that was just updated, it may be already moved to // another phi node. Such fixup requires update. diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 4063762c88a2e4..cee34f0a6da1f3 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1296,6 +1296,15 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, if (!BAA.isMustAlias(MemSet->getDest(), MemCpy->getDest())) return false; + // Don't perform the transform if src_size may be zero. In that case, the + // transform is essentially a complex no-op and may lead to an infinite + // loop if BasicAA is smart enough to understand that dst and dst + src_size + // are still MustAlias after the transform. + Value *SrcSize = MemCpy->getLength(); + if (!isKnownNonZero(SrcSize, + SimplifyQuery(MemCpy->getDataLayout(), DT, AC, MemCpy))) + return false; + // Check that src and dst of the memcpy aren't the same. While memcpy // operands cannot partially overlap, exact equality is allowed. if (isModSet(BAA.getModRefInfo(MemCpy, MemoryLocation::getForSource(MemCpy)))) @@ -1312,7 +1321,6 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, // Use the same i8* dest as the memcpy, killing the memset dest if different. Value *Dest = MemCpy->getRawDest(); Value *DestSize = MemSet->getLength(); - Value *SrcSize = MemCpy->getLength(); if (mayBeVisibleThroughUnwinding(Dest, MemSet, MemCpy)) return false; @@ -1726,8 +1734,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { return true; } - // If the size is zero, remove the memcpy. This also prevents infinite loops - // in processMemSetMemCpyDependence, which is a no-op for zero-length memcpys. + // If the size is zero, remove the memcpy. if (isZeroSize(M->getLength())) { ++BBI; eraseInstruction(M); diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index f7268e8b17d2f7..e742d2ed12af1a 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -596,8 +596,8 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, /// of leaf nodes as inner nodes cannot occur by remembering all of the future /// leaves and refusing to reuse any of them as inner nodes. SmallPtrSet NotRewritable; - for (unsigned i = 0, e = Ops.size(); i != e; ++i) - NotRewritable.insert(Ops[i].Op); + for (const ValueEntry &Op : Ops) + NotRewritable.insert(Op.Op); // ExpressionChangedStart - Non-null if the rewritten expression differs from // the original in some non-trivial way, requiring the clearing of optional @@ -762,8 +762,8 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, } // Throw away any left over nodes from the original expression. - for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i) - RedoInsts.insert(NodesToRewrite[i]); + for (BinaryOperator *BO : NodesToRewrite) + RedoInsts.insert(BO); } /// Insert instructions before the instruction pointed to by BI, @@ -1988,8 +1988,8 @@ void ReassociatePass::EraseInst(Instruction *I) { I->eraseFromParent(); // Optimize its operands. SmallPtrSet Visited; // Detect self-referential nodes. - for (unsigned i = 0, e = Ops.size(); i != e; ++i) - if (Instruction *Op = dyn_cast(Ops[i])) { + for (Value *V : Ops) + if (Instruction *Op = dyn_cast(V)) { // If this is a node in an expression tree, climb to the expression root // and add that since that's where optimization actually happens. unsigned Opcode = Op->getOpcode(); diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index fdb3211b4a438e..6a3b0e1b43e219 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -1245,9 +1245,12 @@ static BasicBlock *buildClonedLoopBlocks( if (SE && isa(I)) SE->forgetValue(&I); + BasicBlock::iterator InsertPt = MergeBB->getFirstInsertionPt(); + auto *MergePN = PHINode::Create(I.getType(), /*NumReservedValues*/ 2, ".us-phi"); - MergePN->insertBefore(MergeBB->getFirstInsertionPt()); + MergePN->insertBefore(InsertPt); + MergePN->setDebugLoc(InsertPt->getDebugLoc()); I.replaceAllUsesWith(MergePN); MergePN->addIncoming(&I, ExitBB); MergePN->addIncoming(&ClonedI, ClonedExitBB); @@ -1306,8 +1309,9 @@ static BasicBlock *buildClonedLoopBlocks( else if (auto *SI = dyn_cast(ClonedTerminator)) ClonedConditionToErase = SI->getCondition(); + Instruction *BI = BranchInst::Create(ClonedSuccBB, ClonedParentBB); + BI->setDebugLoc(ClonedTerminator->getDebugLoc()); ClonedTerminator->eraseFromParent(); - BranchInst::Create(ClonedSuccBB, ClonedParentBB); if (ClonedConditionToErase) RecursivelyDeleteTriviallyDeadInstructions(ClonedConditionToErase, nullptr, @@ -2334,22 +2338,27 @@ static void unswitchNontrivialInvariants( // nuke the initial terminator placed in the split block. SplitBB->getTerminator()->eraseFromParent(); if (FullUnswitch) { - // Splice the terminator from the original loop and rewrite its - // successors. - TI.moveBefore(*SplitBB, SplitBB->end()); - // Keep a clone of the terminator for MSSA updates. Instruction *NewTI = TI.clone(); NewTI->insertInto(ParentBB, ParentBB->end()); + // Splice the terminator from the original loop and rewrite its + // successors. + TI.moveBefore(*SplitBB, SplitBB->end()); + TI.dropLocation(); + // First wire up the moved terminator to the preheaders. if (BI) { BasicBlock *ClonedPH = ClonedPHs.begin()->second; BI->setSuccessor(ClonedSucc, ClonedPH); BI->setSuccessor(1 - ClonedSucc, LoopPH); Value *Cond = skipTrivialSelect(BI->getCondition()); - if (InsertFreeze) + if (InsertFreeze) { + // We don't give any debug location to the new freeze, because the + // BI (`dyn_cast(TI)`) is an in-loop instruction hoisted + // out of the loop. Cond = new FreezeInst(Cond, Cond->getName() + ".fr", BI->getIterator()); + } BI->setCondition(Cond); DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); } else { @@ -2432,12 +2441,13 @@ static void unswitchNontrivialInvariants( DTUpdates.push_back({DominatorTree::Delete, ParentBB, SuccBB}); } - // After MSSAU update, remove the cloned terminator instruction NewTI. - ParentBB->getTerminator()->eraseFromParent(); - // Create a new unconditional branch to the continuing block (as opposed to // the one cloned). - BranchInst::Create(RetainedSuccBB, ParentBB); + Instruction *NewBI = BranchInst::Create(RetainedSuccBB, ParentBB); + NewBI->setDebugLoc(NewTI->getDebugLoc()); + + // After MSSAU update, remove the cloned terminator instruction NewTI. + NewTI->eraseFromParent(); } else { assert(BI && "Only branches have partial unswitching."); assert(UnswitchedSuccBBs.size() == 1 && @@ -2710,6 +2720,7 @@ static BranchInst *turnSelectIntoBranch(SelectInst *SI, DominatorTree &DT, PHINode::Create(SI->getType(), 2, "unswitched.select", SI->getIterator()); Phi->addIncoming(SI->getTrueValue(), ThenBB); Phi->addIncoming(SI->getFalseValue(), HeadBB); + Phi->setDebugLoc(SI->getDebugLoc()); SI->replaceAllUsesWith(Phi); SI->eraseFromParent(); diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt index 51e8821773c3af..1b811c7cebef9b 100644 --- a/llvm/lib/Transforms/Utils/CMakeLists.txt +++ b/llvm/lib/Transforms/Utils/CMakeLists.txt @@ -20,6 +20,7 @@ add_llvm_component_library(LLVMTransformUtils CountVisits.cpp Debugify.cpp DemoteRegToStack.cpp + DXILResource.cpp DXILUpgrade.cpp EntryExitInstrumenter.cpp EscapeEnumerator.cpp diff --git a/llvm/lib/Transforms/Utils/DXILResource.cpp b/llvm/lib/Transforms/Utils/DXILResource.cpp new file mode 100644 index 00000000000000..7281c7ad045312 --- /dev/null +++ b/llvm/lib/Transforms/Utils/DXILResource.cpp @@ -0,0 +1,369 @@ +//===- DXILResource.cpp - Tools to translate DXIL resources ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/DXILResource.h" +#include "llvm/ADT/APInt.h" +#include "llvm/IR/DerivedTypes.h" + +using namespace llvm; +using namespace dxil; + +bool ResourceInfo::isUAV() const { return RC == ResourceClass::UAV; } + +bool ResourceInfo::isCBuffer() const { return RC == ResourceClass::CBuffer; } + +bool ResourceInfo::isSampler() const { return RC == ResourceClass::Sampler; } + +bool ResourceInfo::isStruct() const { + return Kind == ResourceKind::StructuredBuffer; +} + +bool ResourceInfo::isTyped() const { + switch (Kind) { + case ResourceKind::Texture1D: + case ResourceKind::Texture2D: + case ResourceKind::Texture2DMS: + case ResourceKind::Texture3D: + case ResourceKind::TextureCube: + case ResourceKind::Texture1DArray: + case ResourceKind::Texture2DArray: + case ResourceKind::Texture2DMSArray: + case ResourceKind::TextureCubeArray: + case ResourceKind::TypedBuffer: + return true; + case ResourceKind::RawBuffer: + case ResourceKind::StructuredBuffer: + case ResourceKind::FeedbackTexture2D: + case ResourceKind::FeedbackTexture2DArray: + case ResourceKind::CBuffer: + case ResourceKind::Sampler: + case ResourceKind::TBuffer: + case ResourceKind::RTAccelerationStructure: + return false; + case ResourceKind::Invalid: + case ResourceKind::NumEntries: + llvm_unreachable("Invalid resource kind"); + } +} + +bool ResourceInfo::isFeedback() const { + return Kind == ResourceKind::FeedbackTexture2D || + Kind == ResourceKind::FeedbackTexture2DArray; +} + +bool ResourceInfo::isMultiSample() const { + return Kind == ResourceKind::Texture2DMS || + Kind == ResourceKind::Texture2DMSArray; +} + +ResourceInfo ResourceInfo::SRV(Value *Symbol, StringRef Name, + ResourceBinding Binding, uint32_t UniqueID, + ElementType ElementTy, uint32_t ElementCount, + ResourceKind Kind) { + ResourceInfo RI(ResourceClass::SRV, Kind, Symbol, Name, Binding, UniqueID); + assert(RI.isTyped() && !(RI.isStruct() || RI.isMultiSample()) && + "Invalid ResourceKind for SRV constructor."); + RI.Typed.ElementTy = ElementTy; + RI.Typed.ElementCount = ElementCount; + return RI; +} + +ResourceInfo ResourceInfo::RawBuffer(Value *Symbol, StringRef Name, + ResourceBinding Binding, + uint32_t UniqueID) { + ResourceInfo RI(ResourceClass::SRV, ResourceKind::RawBuffer, Symbol, Name, + Binding, UniqueID); + return RI; +} + +ResourceInfo ResourceInfo::StructuredBuffer(Value *Symbol, StringRef Name, + ResourceBinding Binding, + uint32_t UniqueID, uint32_t Stride, + Align Alignment) { + ResourceInfo RI(ResourceClass::SRV, ResourceKind::StructuredBuffer, Symbol, + Name, Binding, UniqueID); + RI.Struct.Stride = Stride; + RI.Struct.Alignment = Alignment; + return RI; +} + +ResourceInfo ResourceInfo::Texture2DMS(Value *Symbol, StringRef Name, + ResourceBinding Binding, + uint32_t UniqueID, ElementType ElementTy, + uint32_t ElementCount, + uint32_t SampleCount) { + ResourceInfo RI(ResourceClass::SRV, ResourceKind::Texture2DMS, Symbol, Name, + Binding, UniqueID); + RI.Typed.ElementTy = ElementTy; + RI.Typed.ElementCount = ElementCount; + RI.MultiSample.Count = SampleCount; + return RI; +} + +ResourceInfo ResourceInfo::Texture2DMSArray( + Value *Symbol, StringRef Name, ResourceBinding Binding, uint32_t UniqueID, + ElementType ElementTy, uint32_t ElementCount, uint32_t SampleCount) { + ResourceInfo RI(ResourceClass::SRV, ResourceKind::Texture2DMSArray, Symbol, + Name, Binding, UniqueID); + RI.Typed.ElementTy = ElementTy; + RI.Typed.ElementCount = ElementCount; + RI.MultiSample.Count = SampleCount; + return RI; +} + +ResourceInfo ResourceInfo::UAV(Value *Symbol, StringRef Name, + ResourceBinding Binding, uint32_t UniqueID, + ElementType ElementTy, uint32_t ElementCount, + bool GloballyCoherent, bool IsROV, + ResourceKind Kind) { + ResourceInfo RI(ResourceClass::UAV, Kind, Symbol, Name, Binding, UniqueID); + assert(RI.isTyped() && !(RI.isStruct() || RI.isMultiSample()) && + "Invalid ResourceKind for UAV constructor."); + RI.Typed.ElementTy = ElementTy; + RI.Typed.ElementCount = ElementCount; + RI.UAVFlags.GloballyCoherent = GloballyCoherent; + RI.UAVFlags.IsROV = IsROV; + RI.UAVFlags.HasCounter = false; + return RI; +} + +ResourceInfo ResourceInfo::RWRawBuffer(Value *Symbol, StringRef Name, + ResourceBinding Binding, + uint32_t UniqueID, bool GloballyCoherent, + bool IsROV) { + ResourceInfo RI(ResourceClass::UAV, ResourceKind::RawBuffer, Symbol, Name, + Binding, UniqueID); + RI.UAVFlags.GloballyCoherent = GloballyCoherent; + RI.UAVFlags.IsROV = IsROV; + RI.UAVFlags.HasCounter = false; + return RI; +} + +ResourceInfo ResourceInfo::RWStructuredBuffer(Value *Symbol, StringRef Name, + ResourceBinding Binding, + uint32_t UniqueID, + uint32_t Stride, Align Alignment, + bool GloballyCoherent, bool IsROV, + bool HasCounter) { + ResourceInfo RI(ResourceClass::UAV, ResourceKind::StructuredBuffer, Symbol, + Name, Binding, UniqueID); + RI.Struct.Stride = Stride; + RI.Struct.Alignment = Alignment; + RI.UAVFlags.GloballyCoherent = GloballyCoherent; + RI.UAVFlags.IsROV = IsROV; + RI.UAVFlags.HasCounter = HasCounter; + return RI; +} + +ResourceInfo +ResourceInfo::RWTexture2DMS(Value *Symbol, StringRef Name, + ResourceBinding Binding, uint32_t UniqueID, + ElementType ElementTy, uint32_t ElementCount, + uint32_t SampleCount, bool GloballyCoherent) { + ResourceInfo RI(ResourceClass::UAV, ResourceKind::Texture2DMS, Symbol, Name, + Binding, UniqueID); + RI.Typed.ElementTy = ElementTy; + RI.Typed.ElementCount = ElementCount; + RI.UAVFlags.GloballyCoherent = GloballyCoherent; + RI.UAVFlags.IsROV = false; + RI.UAVFlags.HasCounter = false; + RI.MultiSample.Count = SampleCount; + return RI; +} + +ResourceInfo +ResourceInfo::RWTexture2DMSArray(Value *Symbol, StringRef Name, + ResourceBinding Binding, uint32_t UniqueID, + ElementType ElementTy, uint32_t ElementCount, + uint32_t SampleCount, bool GloballyCoherent) { + ResourceInfo RI(ResourceClass::UAV, ResourceKind::Texture2DMSArray, Symbol, + Name, Binding, UniqueID); + RI.Typed.ElementTy = ElementTy; + RI.Typed.ElementCount = ElementCount; + RI.UAVFlags.GloballyCoherent = GloballyCoherent; + RI.UAVFlags.IsROV = false; + RI.UAVFlags.HasCounter = false; + RI.MultiSample.Count = SampleCount; + return RI; +} + +ResourceInfo ResourceInfo::FeedbackTexture2D(Value *Symbol, StringRef Name, + ResourceBinding Binding, + uint32_t UniqueID, + SamplerFeedbackType FeedbackTy) { + ResourceInfo RI(ResourceClass::UAV, ResourceKind::FeedbackTexture2D, Symbol, + Name, Binding, UniqueID); + RI.UAVFlags.GloballyCoherent = false; + RI.UAVFlags.IsROV = false; + RI.UAVFlags.HasCounter = false; + RI.Feedback.Type = FeedbackTy; + return RI; +} + +ResourceInfo +ResourceInfo::FeedbackTexture2DArray(Value *Symbol, StringRef Name, + ResourceBinding Binding, uint32_t UniqueID, + SamplerFeedbackType FeedbackTy) { + ResourceInfo RI(ResourceClass::UAV, ResourceKind::FeedbackTexture2DArray, + Symbol, Name, Binding, UniqueID); + RI.UAVFlags.GloballyCoherent = false; + RI.UAVFlags.IsROV = false; + RI.UAVFlags.HasCounter = false; + RI.Feedback.Type = FeedbackTy; + return RI; +} + +ResourceInfo ResourceInfo::CBuffer(Value *Symbol, StringRef Name, + ResourceBinding Binding, uint32_t UniqueID, + uint32_t Size) { + ResourceInfo RI(ResourceClass::CBuffer, ResourceKind::CBuffer, Symbol, Name, + Binding, UniqueID); + RI.CBufferSize = Size; + return RI; +} + +ResourceInfo ResourceInfo::Sampler(Value *Symbol, StringRef Name, + ResourceBinding Binding, uint32_t UniqueID, + SamplerType SamplerTy) { + ResourceInfo RI(ResourceClass::Sampler, ResourceKind::Sampler, Symbol, Name, + Binding, UniqueID); + RI.SamplerTy = SamplerTy; + return RI; +} + +bool ResourceInfo::operator==(const ResourceInfo &RHS) const { + if (std::tie(Symbol, Name, Binding, UniqueID, RC, Kind) != + std::tie(RHS.Symbol, RHS.Name, RHS.Binding, RHS.UniqueID, RHS.RC, + RHS.Kind)) + return false; + if (isCBuffer()) + return CBufferSize == RHS.CBufferSize; + if (isSampler()) + return SamplerTy == RHS.SamplerTy; + if (isUAV() && UAVFlags != RHS.UAVFlags) + return false; + + if (isStruct()) + return Struct == RHS.Struct; + if (isFeedback()) + return Feedback == RHS.Feedback; + if (isTyped() && Typed != RHS.Typed) + return false; + + if (isMultiSample()) + return MultiSample == RHS.MultiSample; + + assert((Kind == ResourceKind::RawBuffer) && "Unhandled resource kind"); + return true; +} + +MDTuple *ResourceInfo::getAsMetadata(LLVMContext &Ctx) const { + SmallVector MDVals; + + Type *I32Ty = Type::getInt32Ty(Ctx); + Type *I1Ty = Type::getInt1Ty(Ctx); + auto getIntMD = [&I32Ty](uint32_t V) { + return ConstantAsMetadata::get( + Constant::getIntegerValue(I32Ty, APInt(32, V))); + }; + auto getBoolMD = [&I1Ty](uint32_t V) { + return ConstantAsMetadata::get( + Constant::getIntegerValue(I1Ty, APInt(1, V))); + }; + + MDVals.push_back(getIntMD(UniqueID)); + MDVals.push_back(ValueAsMetadata::get(Symbol)); + MDVals.push_back(MDString::get(Ctx, Name)); + MDVals.push_back(getIntMD(Binding.Space)); + MDVals.push_back(getIntMD(Binding.LowerBound)); + MDVals.push_back(getIntMD(Binding.Size)); + + if (isCBuffer()) { + MDVals.push_back(getIntMD(CBufferSize)); + MDVals.push_back(nullptr); + } else if (isSampler()) { + MDVals.push_back(getIntMD(llvm::to_underlying(SamplerTy))); + MDVals.push_back(nullptr); + } else { + MDVals.push_back(getIntMD(llvm::to_underlying(Kind))); + + if (isUAV()) { + MDVals.push_back(getBoolMD(UAVFlags.GloballyCoherent)); + MDVals.push_back(getBoolMD(UAVFlags.HasCounter)); + MDVals.push_back(getBoolMD(UAVFlags.IsROV)); + } else { + // All SRVs include sample count in the metadata, but it's only meaningful + // for multi-sampled textured. Also, UAVs can be multisampled in SM6.7+, + // but this just isn't reflected in the metadata at all. + uint32_t SampleCount = isMultiSample() ? MultiSample.Count : 0; + MDVals.push_back(getIntMD(SampleCount)); + } + + // Further properties are attached to a metadata list of tag-value pairs. + SmallVector Tags; + if (isStruct()) { + Tags.push_back( + getIntMD(llvm::to_underlying(ExtPropTags::StructuredBufferStride))); + Tags.push_back(getIntMD(Struct.Stride)); + } else if (isTyped()) { + Tags.push_back(getIntMD(llvm::to_underlying(ExtPropTags::ElementType))); + Tags.push_back(getIntMD(llvm::to_underlying(Typed.ElementTy))); + } else if (isFeedback()) { + Tags.push_back( + getIntMD(llvm::to_underlying(ExtPropTags::SamplerFeedbackKind))); + Tags.push_back(getIntMD(llvm::to_underlying(Feedback.Type))); + } + MDVals.push_back(Tags.empty() ? nullptr : MDNode::get(Ctx, Tags)); + } + + return MDNode::get(Ctx, MDVals); +} + +std::pair ResourceInfo::getAnnotateProps() const { + uint32_t ResourceKind = llvm::to_underlying(Kind); + uint32_t AlignLog2 = isStruct() ? Log2(Struct.Alignment) : 0; + bool IsUAV = isUAV(); + bool IsROV = IsUAV ? UAVFlags.IsROV : 0; + bool IsGloballyCoherent = IsUAV ? UAVFlags.GloballyCoherent : 0; + uint8_t SamplerCmpOrHasCounter = 0; + if (IsUAV) + SamplerCmpOrHasCounter = UAVFlags.HasCounter; + else if (isSampler()) + SamplerCmpOrHasCounter = SamplerTy == SamplerType::Comparison; + + // TODO: Document this format. Currently the only reference is the + // implementation of dxc's DxilResourceProperties struct. + uint32_t Word0 = 0; + Word0 |= ResourceKind & 0xFF; + Word0 |= (AlignLog2 & 0xF) << 8; + Word0 |= (IsUAV & 1) << 12; + Word0 |= (IsROV & 1) << 13; + Word0 |= (IsGloballyCoherent & 1) << 14; + Word0 |= (SamplerCmpOrHasCounter & 1) << 15; + + uint32_t Word1 = 0; + if (isStruct()) + Word1 = Struct.Stride; + else if (isCBuffer()) + Word1 = CBufferSize; + else if (isFeedback()) + Word1 = llvm::to_underlying(Feedback.Type); + else if (isTyped()) { + uint32_t CompType = llvm::to_underlying(Typed.ElementTy); + uint32_t CompCount = Typed.ElementCount; + uint32_t SampleCount = isMultiSample() ? MultiSample.Count : 0; + + Word1 |= (CompType & 0xFF) << 0; + Word1 |= (CompCount & 0xFF) << 8; + Word1 |= (SampleCount & 0xFF) << 16; + } + + return {Word0, Word1}; +} + +#define DEBUG_TYPE "dxil-resource" diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 6cb6540d1a7b64..9c9fc7a49a9d18 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1815,10 +1815,9 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, // Iterate over all instructions, updating metadata and debug-info records. for (; FI != Fn->end(); ++FI) { - for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; - ++BI) { - UpdateInst(*BI); - for (DbgRecord &DVR : BI->getDbgRecordRange()) { + for (Instruction &I : *FI) { + UpdateInst(I); + for (DbgRecord &DVR : I.getDbgRecordRange()) { UpdateDVR(&DVR); } } diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index a127a3265758d1..ff93035ce0652f 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1192,6 +1192,19 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src, } } +Value *llvm::createSimpleTargetReduction(VectorBuilder &VBuilder, Value *Src, + const RecurrenceDescriptor &Desc) { + RecurKind Kind = Desc.getRecurrenceKind(); + assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && + "AnyOf reduction is not supported."); + auto *SrcTy = cast(Src->getType()); + Type *SrcEltTy = SrcTy->getElementType(); + Value *Iden = + Desc.getRecurrenceIdentity(Kind, SrcEltTy, Desc.getFastMathFlags()); + Value *Ops[] = {Iden, Src}; + return VBuilder.createSimpleTargetReduction(Kind, SrcTy, Ops); +} + Value *llvm::createTargetReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, Value *Src, PHINode *OrigPhi) { @@ -1220,6 +1233,20 @@ Value *llvm::createOrderedReduction(IRBuilderBase &B, return B.CreateFAddReduce(Start, Src); } +Value *llvm::createOrderedReduction(VectorBuilder &VBuilder, + const RecurrenceDescriptor &Desc, + Value *Src, Value *Start) { + assert((Desc.getRecurrenceKind() == RecurKind::FAdd || + Desc.getRecurrenceKind() == RecurKind::FMulAdd) && + "Unexpected reduction kind"); + assert(Src->getType()->isVectorTy() && "Expected a vector type"); + assert(!Start->getType()->isVectorTy() && "Expected a scalar type"); + + auto *SrcTy = cast(Src->getType()); + Value *Ops[] = {Start, Src}; + return VBuilder.createSimpleTargetReduction(RecurKind::FAdd, SrcTy, Ops); +} + void llvm::propagateIRFlags(Value *I, ArrayRef VL, Value *OpValue, bool IncludeWrapFlags) { auto *VecOp = dyn_cast(I); diff --git a/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/llvm/lib/Transforms/Utils/LowerSwitch.cpp index 55f35448af6376..b5c4e93be574ba 100644 --- a/llvm/lib/Transforms/Utils/LowerSwitch.cpp +++ b/llvm/lib/Transforms/Utils/LowerSwitch.cpp @@ -369,7 +369,7 @@ void ProcessSwitchInst(SwitchInst *SI, const unsigned NumSimpleCases = Clusterify(Cases, SI); IntegerType *IT = cast(SI->getCondition()->getType()); const unsigned BitWidth = IT->getBitWidth(); - // Explictly use higher precision to prevent unsigned overflow where + // Explicitly use higher precision to prevent unsigned overflow where // `UnsignedMax - 0 + 1 == 0` APInt UnsignedZero(BitWidth + 1, 0); APInt UnsignedMax = APInt::getMaxValue(BitWidth); diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 3fa3c0f1f52b02..8f717cb43bcb45 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -7573,11 +7573,31 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValu return false; if (C->isNullValue() || isa(C)) { - // Only look at the first use, avoid hurting compile time with long uselists - auto *Use = cast(*I->user_begin()); + // Only look at the first use we can handle, avoid hurting compile time with + // long uselists + auto FindUse = llvm::find_if(I->users(), [](auto *U) { + auto *Use = cast(U); + // Change this list when we want to add new instructions. + switch (Use->getOpcode()) { + default: + return false; + case Instruction::GetElementPtr: + case Instruction::Ret: + case Instruction::BitCast: + case Instruction::Load: + case Instruction::Store: + case Instruction::Call: + case Instruction::CallBr: + case Instruction::Invoke: + return true; + } + }); + if (FindUse == I->user_end()) + return false; + auto *Use = cast(*FindUse); // Bail out if Use is not in the same BB as I or Use == I or Use comes - // before I in the block. The latter two can be the case if Use is a PHI - // node. + // before I in the block. The latter two can be the case if Use is a + // PHI node. if (Use->getParent() != I->getParent() || Use == I || Use->comesBefore(I)) return false; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 7d37d67cde29c1..1481ddffe6b269 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -986,9 +986,12 @@ void reportVectorizationFailure(const StringRef DebugMsg, << "loop not vectorized: " << OREMsg); } -void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, +/// Reports an informative message: print \p Msg for debugging purposes as well +/// as an optimization remark. Uses either \p I as location of the remark, or +/// otherwise \p TheLoop. +static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, - Instruction *I) { + Instruction *I = nullptr) { LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); ORE->emit( @@ -1516,9 +1519,7 @@ class LoopVectorizationCostModel { TTI.hasActiveVectorLength(0, nullptr, Align()) && !EnableVPlanNativePath && // FIXME: implement support for max safe dependency distance. - Legal->isSafeForAnyVectorWidth() && - // FIXME: remove this once reductions are supported. - Legal->getReductionVars().empty(); + Legal->isSafeForAnyVectorWidth(); if (!EVLIsLegal) { // If for some reason EVL mode is unsupported, fallback to // DataWithoutLaneMask to try to vectorize the loop with folded tail @@ -8693,6 +8694,14 @@ static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop, Value *IncomingValue = ExitPhi.getIncomingValueForBlock(ExitingBB); VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan); + // Exit values for inductions are computed and updated outside of VPlan and + // independent of induction recipes. + // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update + // live-outs. + if ((isa(V) && + !cast(V)->getTruncInst()) || + isa(V)) + continue; Plan.addLiveOut(&ExitPhi, V); } } @@ -10275,7 +10284,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, BFI, PSI, Checks); - VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); + VPlan &BestPlan = LVP.getBestPlan(); + assert(BestPlan.hasScalarVFOnly() && + "VPlan cost model and legacy cost model disagreed"); LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); ORE->emit([&]() { diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 5c2fc0b9320e89..7b981bead6bb89 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -9316,7 +9316,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, function_ref VectorCost) { // Calculate the cost of this instruction. InstructionCost ScalarCost = 0; - if (isa(VL0)) { + if (isa(VL0)) { // For some of the instructions no need to calculate cost for each // particular instruction, we can use the cost of the single // instruction x total number of scalar instructions. @@ -9637,9 +9637,33 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, ? CmpInst::BAD_FCMP_PREDICATE : CmpInst::BAD_ICMP_PREDICATE; - return TTI->getCmpSelInstrCost(E->getOpcode(), OrigScalarTy, - Builder.getInt1Ty(), CurrentPred, CostKind, - VI); + InstructionCost ScalarCost = TTI->getCmpSelInstrCost( + E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred, + CostKind, VI); + auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI); + if (MinMaxID != Intrinsic::not_intrinsic) { + Type *CanonicalType = OrigScalarTy; + if (CanonicalType->isPtrOrPtrVectorTy()) + CanonicalType = CanonicalType->getWithNewType(IntegerType::get( + CanonicalType->getContext(), + DL->getTypeSizeInBits(CanonicalType->getScalarType()))); + + IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType, + {CanonicalType, CanonicalType}); + InstructionCost IntrinsicCost = + TTI->getIntrinsicInstrCost(CostAttrs, CostKind); + // If the selects are the only uses of the compares, they will be + // dead and we can adjust the cost by removing their cost. + if (SelectOnly) { + auto *CI = cast(VI->getOperand(0)); + IntrinsicCost -= TTI->getCmpSelInstrCost( + CI->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), + CI->getPredicate(), CostKind, CI); + } + ScalarCost = std::min(ScalarCost, IntrinsicCost); + } + + return ScalarCost; }; auto GetVectorCost = [&](InstructionCost CommonCost) { auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size()); @@ -9649,17 +9673,24 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, // Check if it is possible and profitable to use min/max for selects // in VL. // - auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL); - if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) { - IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy, - {VecTy, VecTy}); + auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL); + if (MinMaxID != Intrinsic::not_intrinsic) { + Type *CanonicalType = VecTy; + if (CanonicalType->isPtrOrPtrVectorTy()) + CanonicalType = CanonicalType->getWithNewType(IntegerType::get( + CanonicalType->getContext(), + DL->getTypeSizeInBits(CanonicalType->getScalarType()))); + IntrinsicCostAttributes CostAttrs(MinMaxID, VecTy, {VecTy, VecTy}); InstructionCost IntrinsicCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind); // If the selects are the only uses of the compares, they will be // dead and we can adjust the cost by removing their cost. - if (IntrinsicAndUse.second) - IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, + if (SelectOnly) { + auto *CI = + cast(cast(VL.front())->getOperand(0)); + IntrinsicCost -= TTI->getCmpSelInstrCost(CI->getOpcode(), VecTy, MaskTy, VecPred, CostKind); + } VecCost = std::min(VecCost, IntrinsicCost); } return VecCost + CommonCost; @@ -14201,9 +14232,23 @@ Value *BoUpSLP::vectorizeTree( for (Instruction *I : RemovedInsts) { if (getTreeEntry(I)->Idx != 0) continue; + SmallVector LogicalOpSelects; I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) { + // Do not replace condition of the logical op in form select . + bool IsPoisoningLogicalOp = isa(U.getUser()) && + (match(U.getUser(), m_LogicalAnd()) || + match(U.getUser(), m_LogicalOr())) && + U.getOperandNo() == 0; + if (IsPoisoningLogicalOp) { + LogicalOpSelects.push_back(cast(U.getUser())); + return false; + } return UserIgnoreList->contains(U.getUser()); }); + // Replace conditions of the poisoning logical ops with the non-poison + // constant value. + for (SelectInst *SI : LogicalOpSelects) + SI->setCondition(Constant::getNullValue(SI->getCondition()->getType())); } } // Retain to-be-deleted instructions for some debug-info bookkeeping and alias @@ -18240,6 +18285,14 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts)) return false; + if (MaxVFOnly && BuildVectorOpds.size() == 2) { + R.getORE()->emit([&]() { + return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI) + << "Cannot SLP vectorize list: only 2 elements of buildvalue, " + "trying reduction first."; + }); + return false; + } LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); // Aggregate value is unlikely to be processed in vector register. return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly); @@ -18256,6 +18309,14 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, isFixedVectorShuffle(BuildVectorOpds, Mask))) return false; + if (MaxVFOnly && BuildVectorInsts.size() == 2) { + R.getORE()->emit([&]() { + return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI) + << "Cannot SLP vectorize list: only 2 elements of buildvector, " + "trying reduction first."; + }); + return false; + } LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n"); return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index e3b78700e10181..805d9d91fc1860 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -909,6 +909,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPEVLBasedIVPHISC: case VPRecipeBase::VPExpandSCEVSC: case VPRecipeBase::VPInstructionSC: + case VPRecipeBase::VPReductionEVLSC: case VPRecipeBase::VPReductionSC: case VPRecipeBase::VPReplicateSC: case VPRecipeBase::VPScalarIVStepsSC: @@ -2171,17 +2172,27 @@ class VPReductionRecipe : public VPSingleDefRecipe { /// The recurrence decriptor for the reduction in question. const RecurrenceDescriptor &RdxDesc; bool IsOrdered; + /// Whether the reduction is conditional. + bool IsConditional = false; + +protected: + VPReductionRecipe(const unsigned char SC, const RecurrenceDescriptor &R, + Instruction *I, ArrayRef Operands, + VPValue *CondOp, bool IsOrdered) + : VPSingleDefRecipe(SC, Operands, I), RdxDesc(R), IsOrdered(IsOrdered) { + if (CondOp) { + IsConditional = true; + addOperand(CondOp); + } + } public: VPReductionRecipe(const RecurrenceDescriptor &R, Instruction *I, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, bool IsOrdered) - : VPSingleDefRecipe(VPDef::VPReductionSC, - ArrayRef({ChainOp, VecOp}), I), - RdxDesc(R), IsOrdered(IsOrdered) { - if (CondOp) - addOperand(CondOp); - } + : VPReductionRecipe(VPDef::VPReductionSC, R, I, + ArrayRef({ChainOp, VecOp}), CondOp, + IsOrdered) {} ~VPReductionRecipe() override = default; @@ -2190,7 +2201,15 @@ class VPReductionRecipe : public VPSingleDefRecipe { getVecOp(), getCondOp(), IsOrdered); } - VP_CLASSOF_IMPL(VPDef::VPReductionSC) + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() == VPRecipeBase::VPReductionSC || + R->getVPDefID() == VPRecipeBase::VPReductionEVLSC; + } + + static inline bool classof(const VPUser *U) { + auto *R = dyn_cast(U); + return R && classof(R); + } /// Generate the reduction in the loop void execute(VPTransformState &State) override; @@ -2201,13 +2220,62 @@ class VPReductionRecipe : public VPSingleDefRecipe { VPSlotTracker &SlotTracker) const override; #endif + /// Return the recurrence decriptor for the in-loop reduction. + const RecurrenceDescriptor &getRecurrenceDescriptor() const { + return RdxDesc; + } + /// Return true if the in-loop reduction is ordered. + bool isOrdered() const { return IsOrdered; }; + /// Return true if the in-loop reduction is conditional. + bool isConditional() const { return IsConditional; }; /// The VPValue of the scalar Chain being accumulated. VPValue *getChainOp() const { return getOperand(0); } /// The VPValue of the vector value to be reduced. VPValue *getVecOp() const { return getOperand(1); } /// The VPValue of the condition for the block. VPValue *getCondOp() const { - return getNumOperands() > 2 ? getOperand(2) : nullptr; + return isConditional() ? getOperand(getNumOperands() - 1) : nullptr; + } +}; + +/// A recipe to represent inloop reduction operations with vector-predication +/// intrinsics, performing a reduction on a vector operand with the explicit +/// vector length (EVL) into a scalar value, and adding the result to a chain. +/// The Operands are {ChainOp, VecOp, EVL, [Condition]}. +class VPReductionEVLRecipe : public VPReductionRecipe { +public: + VPReductionEVLRecipe(VPReductionRecipe *R, VPValue *EVL, VPValue *CondOp) + : VPReductionRecipe( + VPDef::VPReductionEVLSC, R->getRecurrenceDescriptor(), + cast_or_null(R->getUnderlyingValue()), + ArrayRef({R->getChainOp(), R->getVecOp(), EVL}), CondOp, + R->isOrdered()) {} + + ~VPReductionEVLRecipe() override = default; + + VPReductionEVLRecipe *clone() override { + llvm_unreachable("cloning not implemented yet"); + } + + VP_CLASSOF_IMPL(VPDef::VPReductionEVLSC) + + /// Generate the reduction in the loop + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// The VPValue of the explicit vector length. + VPValue *getEVL() const { return getOperand(2); } + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return Op == getEVL(); } }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 36d52b255232a0..6d89ad9fee8ad7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -274,6 +274,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { [](const VPScalarCastRecipe *R) { return R->getResultType(); }) .Case([](const VPExpandSCEVRecipe *R) { return R->getSCEV()->getType(); + }) + .Case([this](const auto *R) { + return inferScalarType(R->getChainOp()); }); assert(ResultTy && "could not infer type for the given VPValue"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 53d91ee27b73f0..4b1ac79bbfdd4e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -63,6 +63,7 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPPredInstPHISC: return false; case VPBlendSC: + case VPReductionEVLSC: case VPReductionSC: case VPWidenCanonicalIVSC: case VPWidenCastSC: @@ -104,6 +105,7 @@ bool VPRecipeBase::mayReadFromMemory() const { case VPWidenStoreSC: return false; case VPBlendSC: + case VPReductionEVLSC: case VPReductionSC: case VPWidenCanonicalIVSC: case VPWidenCastSC: @@ -151,6 +153,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); } case VPBlendSC: + case VPReductionEVLSC: case VPReductionSC: case VPScalarIVStepsSC: case VPWidenCanonicalIVSC: @@ -1744,6 +1747,46 @@ void VPReductionRecipe::execute(VPTransformState &State) { } } +void VPReductionEVLRecipe::execute(VPTransformState &State) { + assert(!State.Instance && "Reduction being replicated."); + assert(State.UF == 1 && + "Expected only UF == 1 when vectorizing with explicit vector length."); + + auto &Builder = State.Builder; + // Propagate the fast-math flags carried by the underlying instruction. + IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); + const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); + Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); + + RecurKind Kind = RdxDesc.getRecurrenceKind(); + Value *Prev = State.get(getChainOp(), 0, /*IsScalar*/ true); + Value *VecOp = State.get(getVecOp(), 0); + Value *EVL = State.get(getEVL(), VPIteration(0, 0)); + + VectorBuilder VBuilder(Builder); + VBuilder.setEVL(EVL); + Value *Mask; + // TODO: move the all-true mask generation into VectorBuilder. + if (VPValue *CondOp = getCondOp()) + Mask = State.get(CondOp, 0); + else + Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); + VBuilder.setMask(Mask); + + Value *NewRed; + if (isOrdered()) { + NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev); + } else { + NewRed = createSimpleTargetReduction(VBuilder, VecOp, RdxDesc); + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) + NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev); + else + NewRed = Builder.CreateBinOp( + (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev); + } + State.set(this, NewRed, 0, /*IsScalar*/ true); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { @@ -1756,7 +1799,31 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, O << getUnderlyingInstr()->getFastMathFlags(); O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; getVecOp()->printAsOperand(O, SlotTracker); - if (getCondOp()) { + if (isConditional()) { + O << ", "; + getCondOp()->printAsOperand(O, SlotTracker); + } + O << ")"; + if (RdxDesc.IntermediateStore) + O << " (with final reduction value stored in invariant address sank " + "outside of loop)"; +} + +void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor(); + O << Indent << "REDUCE "; + printAsOperand(O, SlotTracker); + O << " = "; + getChainOp()->printAsOperand(O, SlotTracker); + O << " +"; + if (isa(getUnderlyingInstr())) + O << getUnderlyingInstr()->getFastMathFlags(); + O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " ("; + getVecOp()->printAsOperand(O, SlotTracker); + O << ", "; + getEVL()->printAsOperand(O, SlotTracker); + if (isConditional()) { O << ", "; getCondOp()->printAsOperand(O, SlotTracker); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index b3396a0a46af4c..d668ae2aa5c089 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1427,11 +1427,20 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) { // The transform updates all users of inductions to work based on EVL, instead // of the VF directly. At the moment, widened inductions cannot be updated, so // bail out if the plan contains any. - if (any_of(Header->phis(), [](VPRecipeBase &Phi) { - return (isa(&Phi) || - isa(&Phi)); - })) + bool ContainsWidenInductions = any_of(Header->phis(), [](VPRecipeBase &Phi) { + return isa( + &Phi); + }); + // FIXME: Remove this once we can transform (select header_mask, true_value, + // false_value) into vp.merge. + bool ContainsOutloopReductions = + any_of(Header->phis(), [&](VPRecipeBase &Phi) { + auto *R = dyn_cast(&Phi); + return R && !R->isInLoop(); + }); + if (ContainsWidenInductions || ContainsOutloopReductions) return false; + auto *CanonicalIVPHI = Plan.getCanonicalIV(); VPValue *StartV = CanonicalIVPHI->getStartValue(); @@ -1462,23 +1471,42 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) { for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { for (VPUser *U : collectUsersRecursively(HeaderMask)) { - auto *MemR = dyn_cast(U); - if (!MemR) + VPRecipeBase *NewRecipe = nullptr; + auto *CurRecipe = dyn_cast(U); + if (!CurRecipe) continue; - VPValue *OrigMask = MemR->getMask(); - assert(OrigMask && "Unmasked widen memory recipe when folding tail"); - VPValue *NewMask = HeaderMask == OrigMask ? nullptr : OrigMask; - if (auto *L = dyn_cast(MemR)) { - auto *N = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask); - N->insertBefore(L); - L->replaceAllUsesWith(N); - L->eraseFromParent(); - } else if (auto *S = dyn_cast(MemR)) { - auto *N = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask); - N->insertBefore(S); - S->eraseFromParent(); - } else { - llvm_unreachable("unsupported recipe"); + + auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { + assert(OrigMask && "Unmasked recipe when folding tail"); + return HeaderMask == OrigMask ? nullptr : OrigMask; + }; + if (auto *MemR = dyn_cast(CurRecipe)) { + VPValue *NewMask = GetNewMask(MemR->getMask()); + if (auto *L = dyn_cast(MemR)) + NewRecipe = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask); + else if (auto *S = dyn_cast(MemR)) + NewRecipe = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask); + else + llvm_unreachable("unsupported recipe"); + } else if (auto *RedR = dyn_cast(CurRecipe)) { + NewRecipe = new VPReductionEVLRecipe(RedR, VPEVL, + GetNewMask(RedR->getCondOp())); + } + + if (NewRecipe) { + [[maybe_unused]] unsigned NumDefVal = NewRecipe->getNumDefinedValues(); + assert(NumDefVal == CurRecipe->getNumDefinedValues() && + "New recipe must define the same number of values as the " + "original."); + assert( + NumDefVal <= 1 && + "Only supports recipes with a single definition or without users."); + NewRecipe->insertBefore(CurRecipe); + if (isa(NewRecipe)) { + VPValue *CurVPV = CurRecipe->getVPSingleValue(); + CurVPV->replaceAllUsesWith(NewRecipe->getVPSingleValue()); + } + CurRecipe->eraseFromParent(); } } recursivelyDeleteDeadRecipes(HeaderMask); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index fa6a65ff2f3ada..452c977106a773 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -341,6 +341,7 @@ class VPDef { VPExpandSCEVSC, VPInstructionSC, VPInterleaveSC, + VPReductionEVLSC, VPReductionSC, VPReplicateSC, VPScalarCastSC, diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 4896b8ed2595bb..3a49f95d3f1176 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1859,13 +1859,19 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { if (!FrontU) return false; + // Helper to peek through bitcasts to the same value. + auto IsEquiv = [&](Value *X, Value *Y) { + return X->getType() == Y->getType() && + peekThroughBitcasts(X) == peekThroughBitcasts(Y); + }; + // Look for an identity value. if (FrontLane == 0 && cast(FrontU->get()->getType())->getNumElements() == Ty->getNumElements() && - all_of(drop_begin(enumerate(Item)), [Item](const auto &E) { + all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) { Value *FrontV = Item.front().first->get(); - return !E.value().first || (E.value().first->get() == FrontV && + return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) && E.value().second == (int)E.index()); })) { IdentityLeafs.insert(FrontU); diff --git a/llvm/test/Analysis/CostModel/AArch64/no-sve-no-neon.ll b/llvm/test/Analysis/CostModel/AArch64/no-sve-no-neon.ll index df1e542049ea0e..4e6a36059d8159 100644 --- a/llvm/test/Analysis/CostModel/AArch64/no-sve-no-neon.ll +++ b/llvm/test/Analysis/CostModel/AArch64/no-sve-no-neon.ll @@ -1,10 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -passes="print" 2>&1 -disable-output | FileCheck %s +; RUN: opt -mattr=-neon < %s -passes="print" 2>&1 -disable-output | FileCheck %s +; RUN: opt -mattr=+sve,-neon < %s -passes="print" 2>&1 -disable-output | FileCheck %s target triple = "aarch64-unknown-linux-gnu" target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -define void @uitofp() #0 { +define void @uitofp() { ; CHECK-LABEL: 'uitofp' ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %conv = uitofp <16 x i64> undef to <16 x float> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -12,5 +13,3 @@ define void @uitofp() #0 { %conv = uitofp <16 x i64> undef to <16 x float> ret void } - -attributes #0 = { "target-features"="-neon" } diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different-access-types-rt-checks.ll b/llvm/test/Analysis/LoopAccessAnalysis/different-access-types-rt-checks.ll new file mode 100644 index 00000000000000..58844c10cdcb95 --- /dev/null +++ b/llvm/test/Analysis/LoopAccessAnalysis/different-access-types-rt-checks.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='print' -disable-output %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" + +define void @loads_of_same_pointer_with_different_sizes1(ptr %A, ptr %B, i64 %N) { +; CHECK-LABEL: 'loads_of_same_pointer_with_different_sizes1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group ([[GRP1:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv +; CHECK-NEXT: Against group ([[GRP2:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv +; CHECK-NEXT: %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group [[GRP1]]: +; CHECK-NEXT: (Low: %B High: ((4 * %N) + %B)) +; CHECK-NEXT: Member: {%B,+,4}<%loop> +; CHECK-NEXT: Group [[GRP2]]: +; CHECK-NEXT: (Low: %A High: (3 + %N + %A)) +; CHECK-NEXT: Member: {%A,+,1}<%loop> +; CHECK-NEXT: Member: {%A,+,1}<%loop> +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv + %l0 = load i8, ptr %gep.A, align 1 + %l1 = load i32, ptr %gep.A, align 1 + %l0.ext = sext i8 %l0 to i32 + %iv.trunc = trunc nuw i64 %iv to i32 + %sub.0 = sub i32 %l0.ext, %iv.trunc + %sub.1 = sub i32 %l1, %sub.0 + %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv + store i32 %sub.1, ptr %gep.B, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + +define void @loads_of_same_pointer_with_different_sizes2(ptr %A, ptr %B, i64 %N) { +; CHECK-LABEL: 'loads_of_same_pointer_with_different_sizes2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group ([[GRP3:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv +; CHECK-NEXT: Against group ([[GRP4:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv +; CHECK-NEXT: %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group [[GRP3]]: +; CHECK-NEXT: (Low: %B High: ((4 * %N) + %B)) +; CHECK-NEXT: Member: {%B,+,4}<%loop> +; CHECK-NEXT: Group [[GRP4]]: +; CHECK-NEXT: (Low: %A High: (3 + %N + %A)) +; CHECK-NEXT: Member: {%A,+,1}<%loop> +; CHECK-NEXT: Member: {%A,+,1}<%loop> +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv + %l1 = load i32, ptr %gep.A, align 1 + %l0 = load i8, ptr %gep.A, align 1 + %l0.ext = sext i8 %l0 to i32 + %iv.trunc = trunc nuw i64 %iv to i32 + %sub.0 = sub i32 %l0.ext, %iv.trunc + %sub.1 = sub i32 %l1, %sub.0 + %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv + store i32 %sub.1, ptr %gep.B, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + +define void @loads_of_same_pointer_with_different_sizes_retry_with_runtime_checks(ptr %A, ptr %B, i64 %N, i64 %off) { +; CHECK-LABEL: 'loads_of_same_pointer_with_different_sizes_retry_with_runtime_checks' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group ([[GRP5:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.B.iv = getelementptr inbounds i32, ptr %B, i64 %iv +; CHECK-NEXT: Against group ([[GRP6:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.B.inc = getelementptr inbounds i32, ptr %B, i64 %inc +; CHECK-NEXT: Check 1: +; CHECK-NEXT: Comparing group ([[GRP5]]): +; CHECK-NEXT: %gep.B.iv = getelementptr inbounds i32, ptr %B, i64 %iv +; CHECK-NEXT: Against group ([[GRP7:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv +; CHECK-NEXT: Check 2: +; CHECK-NEXT: Comparing group ([[GRP5]]): +; CHECK-NEXT: %gep.B.iv = getelementptr inbounds i32, ptr %B, i64 %iv +; CHECK-NEXT: Against group ([[GRP8:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv +; CHECK-NEXT: Check 3: +; CHECK-NEXT: Comparing group ([[GRP6]]): +; CHECK-NEXT: %gep.B.inc = getelementptr inbounds i32, ptr %B, i64 %inc +; CHECK-NEXT: Against group ([[GRP7]]): +; CHECK-NEXT: %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv +; CHECK-NEXT: Check 4: +; CHECK-NEXT: Comparing group ([[GRP6]]): +; CHECK-NEXT: %gep.B.inc = getelementptr inbounds i32, ptr %B, i64 %inc +; CHECK-NEXT: Against group ([[GRP8]]): +; CHECK-NEXT: %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group [[GRP5]]: +; CHECK-NEXT: (Low: %B High: ((4 * %N) + %B)) +; CHECK-NEXT: Member: {%B,+,4}<%loop> +; CHECK-NEXT: Group [[GRP6]]: +; CHECK-NEXT: (Low: ((4 * %off) + %B) High: ((4 * %N) + (4 * %off) + %B)) +; CHECK-NEXT: Member: {((4 * %off) + %B),+,4}<%loop> +; CHECK-NEXT: Group [[GRP7]]: +; CHECK-NEXT: (Low: %A High: (%N + %A)) +; CHECK-NEXT: Member: {%A,+,1}<%loop> +; CHECK-NEXT: Group [[GRP8]]: +; CHECK-NEXT: (Low: %A High: (3 + %N + %A)) +; CHECK-NEXT: Member: {%A,+,1}<%loop> +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv + %l0 = load i8, ptr %gep.A, align 1 + %l1 = load i32, ptr %gep.A, align 1 + %l0.ext = sext i8 %l0 to i32 + %iv.trunc = trunc nuw i64 %iv to i32 + %sub.0 = sub i32 %l0.ext, %iv.trunc + %sub.1 = sub i32 %l1, %iv.trunc + %gep.B.iv = getelementptr inbounds i32, ptr %B, i64 %iv + store i32 %sub.0, ptr %gep.B.iv, align 4 + %inc = add nuw nsw i64 %iv, %off + %gep.B.inc = getelementptr inbounds i32, ptr %B, i64 %inc + store i32 %sub.1, ptr %gep.B.inc , align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Analysis/ValueTracking/known-bits.ll b/llvm/test/Analysis/ValueTracking/known-bits.ll new file mode 100644 index 00000000000000..035ccf8d42d13d --- /dev/null +++ b/llvm/test/Analysis/ValueTracking/known-bits.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=instsimplify < %s -S | FileCheck %s + +define <4 x i1> @vec_reverse_known_bits(<4 x i8> %xx) { +; CHECK-LABEL: @vec_reverse_known_bits( +; CHECK-NEXT: ret <4 x i1> +; + %x = or <4 x i8> %xx, + %rev = call <4 x i8> @llvm.vector.reverse(<4 x i8> %x) + %r = icmp slt <4 x i8> %rev, zeroinitializer + ret <4 x i1> %r +} + +define <4 x i1> @vec_reverse_known_bits_fail(<4 x i8> %xx) { +; CHECK-LABEL: @vec_reverse_known_bits_fail( +; CHECK-NEXT: [[X:%.*]] = or <4 x i8> [[XX:%.*]], +; CHECK-NEXT: [[REV:%.*]] = call <4 x i8> @llvm.vector.reverse.v4i8(<4 x i8> [[X]]) +; CHECK-NEXT: [[R:%.*]] = icmp slt <4 x i8> [[REV]], zeroinitializer +; CHECK-NEXT: ret <4 x i1> [[R]] +; + %x = or <4 x i8> %xx, + %rev = call <4 x i8> @llvm.vector.reverse(<4 x i8> %x) + %r = icmp slt <4 x i8> %rev, zeroinitializer + ret <4 x i1> %r +} diff --git a/llvm/test/Analysis/ValueTracking/known-fpclass.ll b/llvm/test/Analysis/ValueTracking/known-fpclass.ll new file mode 100644 index 00000000000000..59f3eed715b521 --- /dev/null +++ b/llvm/test/Analysis/ValueTracking/known-fpclass.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=instsimplify < %s -S | FileCheck %s + +define <4 x i1> @vector_reverse_fpclass(<4 x double> nofpclass(nzero nan) %x) { +; CHECK-LABEL: @vector_reverse_fpclass( +; CHECK-NEXT: ret <4 x i1> +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call <4 x double> @llvm.vector.reverse(<4 x double> %x.abs) + %cmp = fcmp oge <4 x double> %op, + ret <4 x i1> %cmp +} + +define <4 x i1> @vector_reverse_fpclass2(<4 x double> nofpclass(nzero) %x) { +; CHECK-LABEL: @vector_reverse_fpclass2( +; CHECK-NEXT: [[X_ABS:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[X:%.*]]) +; CHECK-NEXT: [[OP:%.*]] = call <4 x double> @llvm.vector.reverse.v4f64(<4 x double> [[X_ABS]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge <4 x double> [[OP]], zeroinitializer +; CHECK-NEXT: ret <4 x i1> [[CMP]] +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call <4 x double> @llvm.vector.reverse(<4 x double> %x.abs) + %cmp = fcmp oge <4 x double> %op, + ret <4 x i1> %cmp +} + diff --git a/llvm/test/Analysis/ValueTracking/known-non-zero.ll b/llvm/test/Analysis/ValueTracking/known-non-zero.ll index c00e47fba8c727..5704586d923002 100644 --- a/llvm/test/Analysis/ValueTracking/known-non-zero.ll +++ b/llvm/test/Analysis/ValueTracking/known-non-zero.ll @@ -1497,4 +1497,27 @@ define i1 @trunc_nsw_nuw_non_zero_fail(i8 %xx) { ret i1 %r } +define <4 x i1> @vec_reverse_non_zero(<4 x i8> %xx) { +; CHECK-LABEL: @vec_reverse_non_zero( +; CHECK-NEXT: ret <4 x i1> zeroinitializer +; + %x = add nuw <4 x i8> %xx, + %rev = call <4 x i8> @llvm.vector.reverse(<4 x i8> %x) + %r = icmp eq <4 x i8> %rev, zeroinitializer + ret <4 x i1> %r +} + +define <4 x i1> @vec_reverse_non_zero_fail(<4 x i8> %xx) { +; CHECK-LABEL: @vec_reverse_non_zero_fail( +; CHECK-NEXT: [[X:%.*]] = add nuw <4 x i8> [[XX:%.*]], +; CHECK-NEXT: [[REV:%.*]] = call <4 x i8> @llvm.vector.reverse.v4i8(<4 x i8> [[X]]) +; CHECK-NEXT: [[R:%.*]] = icmp eq <4 x i8> [[REV]], zeroinitializer +; CHECK-NEXT: ret <4 x i1> [[R]] +; + %x = add nuw <4 x i8> %xx, + %rev = call <4 x i8> @llvm.vector.reverse(<4 x i8> %x) + %r = icmp eq <4 x i8> %rev, zeroinitializer + ret <4 x i1> %r +} + declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i1) diff --git a/llvm/test/Analysis/ValueTracking/knownbits-x86-hadd-hsub.ll b/llvm/test/Analysis/ValueTracking/knownbits-x86-hadd-hsub.ll new file mode 100644 index 00000000000000..e2fe873d715cd6 --- /dev/null +++ b/llvm/test/Analysis/ValueTracking/knownbits-x86-hadd-hsub.ll @@ -0,0 +1,254 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +define <4 x i1> @hadd_and_eq_v4i32(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: define <4 x i1> @hadd_and_eq_v4i32( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <4 x i1> zeroinitializer +; +entry: + %and1 = and <4 x i32> %x, + %and2 = and <4 x i32> %y, + %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2) + %andr = and <4 x i32> %hadd, + %ret = icmp eq <4 x i32> %andr, + ret <4 x i1> %ret +} + +define <8 x i1> @hadd_and_eq_v8i16(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: define <8 x i1> @hadd_and_eq_v8i16( +; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <8 x i1> +; +entry: + %and1 = and <8 x i16> %x, + %and2 = and <8 x i16> %y, + %hadd = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %and1, <8 x i16> %and2) + %andr = and <8 x i16> %hadd, + %ret = icmp eq <8 x i16> %andr, + ret <8 x i1> %ret +} + +define <8 x i1> @hadd_and_eq_v8i16_sat(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: define <8 x i1> @hadd_and_eq_v8i16_sat( +; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <8 x i1> +; +entry: + %and1 = and <8 x i16> %x, + %and2 = and <8 x i16> %y, + %hadd = tail call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %and1, <8 x i16> %and2) + %andr = and <8 x i16> %hadd, + %ret = icmp eq <8 x i16> %andr, + ret <8 x i1> %ret +} + +define <8 x i1> @hadd_and_eq_v8i32(<8 x i32> %x, <8 x i32> %y) { +; CHECK-LABEL: define <8 x i1> @hadd_and_eq_v8i32( +; CHECK-SAME: <8 x i32> [[X:%.*]], <8 x i32> [[Y:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <8 x i1> zeroinitializer +; +entry: + %and1 = and <8 x i32> %x, + %and2 = and <8 x i32> %y, + %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2) + %andr = and <8 x i32> %hadd, + %ret = icmp eq <8 x i32> %andr, + ret <8 x i1> %ret +} + +define <16 x i1> @hadd_and_eq_v16i16(<16 x i16> %x, <16 x i16> %y) { +; CHECK-LABEL: define <16 x i1> @hadd_and_eq_v16i16( +; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <16 x i1> +; +entry: + %and1 = and <16 x i16> %x, + %and2 = and <16 x i16> %y, + %hadd = tail call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %and1, <16 x i16> %and2) + %andr = and <16 x i16> %hadd, + %ret = icmp eq <16 x i16> %andr, + ret <16 x i1> %ret +} + +define <16 x i1> @hadd_and_eq_v16i16_sat(<16 x i16> %x, <16 x i16> %y) { +; CHECK-LABEL: define <16 x i1> @hadd_and_eq_v16i16_sat( +; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <16 x i1> +; +entry: + %and1 = and <16 x i16> %x, + %and2 = and <16 x i16> %y, + %hadd = tail call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %and1, <16 x i16> %and2) + %andr = and <16 x i16> %hadd, + %ret = icmp eq <16 x i16> %andr, + ret <16 x i1> %ret +} + +define <4 x i1> @hsub_trunc_eq_v4i32(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: define <4 x i1> @hsub_trunc_eq_v4i32( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <4 x i1> zeroinitializer +; +entry: + %or1 = or <4 x i32> %x, + %or2 = or <4 x i32> %y, + %hsub = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %or1, <4 x i32> %or2) + %conv = trunc <4 x i32> %hsub to <4 x i16> + %ret = icmp eq <4 x i16> %conv, + ret <4 x i1> %ret +} + +define <8 x i1> @hsub_trunc_eq_v8i16(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: define <8 x i1> @hsub_trunc_eq_v8i16( +; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <8 x i1> +; +entry: + %or1 = or <8 x i16> %x, + %or2 = or <8 x i16> %y, + %hsub = tail call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %or1, <8 x i16> %or2) + %conv = trunc <8 x i16> %hsub to <8 x i8> + %ret = icmp eq <8 x i8> %conv, + ret <8 x i1> %ret +} + +define <8 x i1> @hsub_and_eq_v8i16_sat(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: define <8 x i1> @hsub_and_eq_v8i16_sat( +; CHECK-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <8 x i1> +; +entry: + %or1 = or <8 x i16> %x, + %or2 = or <8 x i16> %y, + %and1 = and <8 x i16> %or1, + %and2 = and <8 x i16> %or2, + %hsub = tail call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %and1, <8 x i16> %and2) + %ret = icmp sle <8 x i16> %hsub, + ret <8 x i1> %ret +} + +define <8 x i1> @hsub_trunc_eq_v8i32(<8 x i32> %x, <8 x i32> %y) { +; CHECK-LABEL: define <8 x i1> @hsub_trunc_eq_v8i32( +; CHECK-SAME: <8 x i32> [[X:%.*]], <8 x i32> [[Y:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <8 x i1> zeroinitializer +; +entry: + %or1 = or <8 x i32> %x, + %or2 = or <8 x i32> %y, + %hsub = tail call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %or1, <8 x i32> %or2) + %conv = trunc <8 x i32> %hsub to <8 x i16> + %ret = icmp eq <8 x i16> %conv, + ret <8 x i1> %ret +} + +define <16 x i1> @hsub_trunc_eq_v16i16(<16 x i16> %x, <16 x i16> %y) { +; CHECK-LABEL: define <16 x i1> @hsub_trunc_eq_v16i16( +; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <16 x i1> +; +entry: + %or1 = or <16 x i16> %x, + %or2 = or <16 x i16> %y, + %hsub = tail call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %or1, <16 x i16> %or2) + %conv = trunc <16 x i16> %hsub to <16 x i8> + %ret = icmp eq <16 x i8> %conv, + ret <16 x i1> %ret +} + +define <16 x i1> @hsub_and_eq_v16i16_sat(<16 x i16> %x, <16 x i16> %y) { +; CHECK-LABEL: define <16 x i1> @hsub_and_eq_v16i16_sat( +; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <16 x i1> +; +entry: + %or1 = or <16 x i16> %x, + %or2 = or <16 x i16> %y, + %and1 = and <16 x i16> %or1, + %and2 = and <16 x i16> %or2, + %hsub = tail call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %and1, <16 x i16> %and2) + %ret = icmp sle <16 x i16> %hsub, + ret <16 x i1> %ret +} + +define <4 x i1> @hadd_shuffle_2st_v4i32(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: define <4 x i1> @hadd_shuffle_2st_v4i32( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <4 x i1> +; +entry: + %and1 = and <4 x i32> %x, + %and2 = and <4 x i32> %y, + %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2) + %shuf = shufflevector <4 x i32> %hadd, <4x i32> zeroinitializer, <4 x i32> + %ret = icmp ne <4 x i32> %shuf, + ret <4 x i1> %ret +} + +define <4 x i1> @hadd_shuffle_4th_v4i32(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: define <4 x i1> @hadd_shuffle_4th_v4i32( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <4 x i1> +; +entry: + %and1 = and <4 x i32> %x, + %and2 = and <4 x i32> %y, + %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2) + %shuf = shufflevector <4 x i32> %hadd, <4x i32> zeroinitializer, <4 x i32> + %ret = icmp ne <4 x i32> %shuf, + ret <4 x i1> %ret +} + +define <4 x i1> @hadd_shuffle_2st_negative_v4i32(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: define <4 x i1> @hadd_shuffle_2st_negative_v4i32( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = and <4 x i32> [[X]], +; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[Y]], +; CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[RET:%.*]] = icmp ne <4 x i32> [[TMP3]], +; CHECK-NEXT: ret <4 x i1> [[RET]] +; +entry: + %and1 = and <4 x i32> %x, + %and2 = and <4 x i32> %y, + %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2) + %shuf = shufflevector <4 x i32> %hadd, <4x i32> zeroinitializer, <4 x i32> + %ret = icmp ne <4 x i32> %shuf, + ret <4 x i1> %ret +} + +define <4 x i1> @hadd_shuffle_4th_negative_v4i32(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: define <4 x i1> @hadd_shuffle_4th_negative_v4i32( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = and <4 x i32> [[X]], +; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[Y]], +; CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[RET:%.*]] = icmp ne <4 x i32> [[TMP3]], +; CHECK-NEXT: ret <4 x i1> [[RET]] +; +entry: + %and1 = and <4 x i32> %x, + %and2 = and <4 x i32> %y, + %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2) + %shuf = shufflevector <4 x i32> %hadd, <4x i32> zeroinitializer, <4 x i32> + %ret = icmp ne <4 x i32> %shuf, + ret <4 x i1> %ret +} diff --git a/llvm/test/Bindings/llvm-c/echo.ll b/llvm/test/Bindings/llvm-c/echo.ll index dc6f2a9e7d2064..06e1c44e0c4908 100644 --- a/llvm/test/Bindings/llvm-c/echo.ll +++ b/llvm/test/Bindings/llvm-c/echo.ll @@ -25,6 +25,9 @@ module asm "classical GAS" @const_gep = global ptr getelementptr (i32, ptr @var, i64 2) @const_inbounds_gep = global ptr getelementptr inbounds (i32, ptr @var, i64 1) +@const_gep_nuw = global ptr getelementptr nuw (i32, ptr @var, i64 1) +@const_gep_nusw = global ptr getelementptr nusw (i32, ptr @var, i64 1) +@const_gep_nuw_inbounds = global ptr getelementptr nuw inbounds (i32, ptr @var, i64 1) @aliased1 = alias i32, ptr @var @aliased2 = internal alias i32, ptr @var @@ -391,6 +394,15 @@ bb_03: ret void } +define ptr @test_gep_no_wrap_flags(ptr %0) { + %gep.1 = getelementptr i8, ptr %0, i32 4 + %gep.inbounds = getelementptr inbounds i8, ptr %0, i32 4 + %gep.nuw = getelementptr nuw i8, ptr %0, i32 4 + %gep.nuw.inbounds = getelementptr inbounds nuw i8, ptr %0, i32 4 + %gep.nusw = getelementptr nusw i8, ptr %0, i32 4 + ret ptr %gep.nusw +} + !llvm.dbg.cu = !{!0, !2} !llvm.module.flags = !{!3} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-concat-vectors.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-concat-vectors.mir index 3263fdcdee6628..87bbbee35d6d62 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-concat-vectors.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-concat-vectors.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=aarch64-linux-gnu -O0 -run-pass=legalizer %s -global-isel-abort=1 -verify-machineinstrs -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-linux-gnu -O0 -run-pass=legalizer %s -global-isel-abort=2 -verify-machineinstrs -o - | FileCheck %s --- name: legal_v4s32_v2s32 @@ -9,11 +9,12 @@ body: | liveins: $d0, $d1 ; CHECK-LABEL: name: legal_v4s32_v2s32 ; CHECK: liveins: $d0, $d1 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0 - ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1 - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[COPY]](<2 x s32>), [[COPY1]](<2 x s32>) - ; CHECK: $q0 = COPY [[CONCAT_VECTORS]](<4 x s32>) - ; CHECK: RET_ReallyLR + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[COPY]](<2 x s32>), [[COPY1]](<2 x s32>) + ; CHECK-NEXT: $q0 = COPY [[CONCAT_VECTORS]](<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR %0:_(<2 x s32>) = COPY $d0 %1:_(<2 x s32>) = COPY $d1 %2:_(<4 x s32>) = G_CONCAT_VECTORS %0(<2 x s32>), %1(<2 x s32>) @@ -28,11 +29,12 @@ body: | liveins: $d0, $d1 ; CHECK-LABEL: name: legal_v8s16_v4s16 ; CHECK: liveins: $d0, $d1 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0 - ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $d1 - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<4 x s16>), [[COPY1]](<4 x s16>) - ; CHECK: $q0 = COPY [[CONCAT_VECTORS]](<8 x s16>) - ; CHECK: RET_ReallyLR + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $d1 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<4 x s16>), [[COPY1]](<4 x s16>) + ; CHECK-NEXT: $q0 = COPY [[CONCAT_VECTORS]](<8 x s16>) + ; CHECK-NEXT: RET_ReallyLR %0:_(<4 x s16>) = COPY $d0 %1:_(<4 x s16>) = COPY $d1 %2:_(<8 x s16>) = G_CONCAT_VECTORS %0(<4 x s16>), %1(<4 x s16>) @@ -47,14 +49,89 @@ body: | liveins: $q0 ; CHECK-LABEL: name: legal_v16s8_v8s8 ; CHECK: liveins: $q0 - ; CHECK: %a:_(<8 x s8>) = G_IMPLICIT_DEF - ; CHECK: %b:_(<8 x s8>) = G_IMPLICIT_DEF - ; CHECK: %concat:_(<16 x s8>) = G_CONCAT_VECTORS %a(<8 x s8>), %b(<8 x s8>) - ; CHECK: $q0 = COPY %concat(<16 x s8>) - ; CHECK: RET_ReallyLR implicit $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %a:_(<8 x s8>) = G_IMPLICIT_DEF + ; CHECK-NEXT: %b:_(<8 x s8>) = G_IMPLICIT_DEF + ; CHECK-NEXT: %concat:_(<16 x s8>) = G_CONCAT_VECTORS %a(<8 x s8>), %b(<8 x s8>) + ; CHECK-NEXT: $q0 = COPY %concat(<16 x s8>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 %a:_(<8 x s8>) = G_IMPLICIT_DEF %b:_(<8 x s8>) = G_IMPLICIT_DEF %concat:_(<16 x s8>) = G_CONCAT_VECTORS %a:_(<8 x s8>), %b:_(<8 x s8>) $q0 = COPY %concat(<16 x s8>) RET_ReallyLR implicit $q0 ... +--- +name: illegal_v16s8_v4s8 +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + + ; CHECK-LABEL: name: illegal_v16s8_v4s8 + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %a:_(p0) = COPY $x0 + ; CHECK-NEXT: %b:_(s32) = G_LOAD %a(p0) :: (load (s32)) + ; CHECK-NEXT: %c:_(<4 x s8>) = G_BITCAST %b(s32) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY [[DEF]](s16) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s16) = COPY [[DEF]](s16) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[DEF]](s16) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[COPY]](s16), [[COPY1]](s16), [[COPY2]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[BUILD_VECTOR]](<8 x s16>) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s8>), [[UV1:%[0-9]+]]:_(<4 x s8>) = G_UNMERGE_VALUES [[TRUNC]](<8 x s8>) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST %c(<4 x s8>) + ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<4 x s8>) + ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<4 x s8>) + ; CHECK-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<4 x s8>) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[BITCAST]](s32), [[BITCAST1]](s32), [[BITCAST2]](s32), [[BITCAST3]](s32) + ; CHECK-NEXT: %f:_(<16 x s8>) = G_BITCAST [[BUILD_VECTOR1]](<4 x s32>) + ; CHECK-NEXT: $q0 = COPY %f(<16 x s8>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %a:_(p0) = COPY $x0 + %b:_(s32) = G_LOAD %a:_(p0) :: (load (s32)) + %c:_(<4 x s8>) = G_BITCAST %b:_(s32) + %d:_(s8) = G_IMPLICIT_DEF + %e:_(<4 x s8>) = G_BUILD_VECTOR %d:_(s8), %d:_(s8), %d:_(s8), %d:_(s8) + %f:_(<16 x s8>) = G_CONCAT_VECTORS %c:_(<4 x s8>), %e:_(<4 x s8>), %e:_(<4 x s8>), %e:_(<4 x s8>) + + $q0 = COPY %f(<16 x s8>) + RET_ReallyLR implicit $q0 +... +--- +name: illegal_v8s16_v2s16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + + ; CHECK-LABEL: name: illegal_v8s16_v2s16 + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %a:_(p0) = COPY $x0 + ; CHECK-NEXT: %b:_(s32) = G_LOAD %a(p0) :: (load (s32)) + ; CHECK-NEXT: %c:_(<2 x s16>) = G_BITCAST %b(s32) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[BUILD_VECTOR]](<4 x s32>) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[TRUNC]](<4 x s16>) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST %c(<2 x s16>) + ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; CHECK-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[BITCAST]](s32), [[BITCAST1]](s32), [[BITCAST2]](s32), [[BITCAST3]](s32) + ; CHECK-NEXT: %f:_(<8 x s16>) = G_BITCAST [[BUILD_VECTOR1]](<4 x s32>) + ; CHECK-NEXT: $q0 = COPY %f(<8 x s16>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %a:_(p0) = COPY $x0 + %b:_(s32) = G_LOAD %a:_(p0) :: (load (s32)) + %c:_(<2 x s16>) = G_BITCAST %b:_(s32) + %d:_(s16) = G_IMPLICIT_DEF + %e:_(<2 x s16>) = G_BUILD_VECTOR %d:_(s16), %d:_(s16) + %f:_(<8 x s16>) = G_CONCAT_VECTORS %c:_(<2 x s16>), %e:_(<2 x s16>), %e:_(<2 x s16>), %e:_(<2 x s16>) + + $q0 = COPY %f(<8 x s16>) + RET_ReallyLR implicit $q0 +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index 1f048528ea1538..6db0b9326ca477 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -119,8 +119,8 @@ # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # # DEBUG-NEXT: G_CONCAT_VECTORS (opcode {{[0-9]+}}): 2 type indices, 0 imm indices -# DEBUG-NEXT: .. the first uncovered type index: 2, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # # DEBUG-NEXT: G_PTRTOINT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 2, OK diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-fp-casts.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-fp-casts.mir index a3094225a031a8..0890f746fa0faa 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-fp-casts.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-fp-casts.mir @@ -635,79 +635,3 @@ body: | %1(s64) = G_FPTOUI %0 $x0 = COPY %1(s64) ... - ---- -name: sitofp_v2s64_v2s32 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $d0 - - ; CHECK-LABEL: name: sitofp_v2s64_v2s32 - ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0 - ; CHECK: [[SSHLLv2i32_shift:%[0-9]+]]:fpr128 = SSHLLv2i32_shift [[COPY]], 0 - ; CHECK: [[SCVTFv2f64_:%[0-9]+]]:fpr128 = nofpexcept SCVTFv2f64 [[SSHLLv2i32_shift]] - ; CHECK: $q0 = COPY [[SCVTFv2f64_]] - %0:fpr(<2 x s32>) = COPY $d0 - %1:fpr(<2 x s64>) = G_SITOFP %0 - $q0 = COPY %1(<2 x s64>) -... - ---- -name: uitofp_v2s64_v2s32 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $d0 - - ; CHECK-LABEL: name: uitofp_v2s64_v2s32 - ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0 - ; CHECK: [[USHLLv2i32_shift:%[0-9]+]]:fpr128 = USHLLv2i32_shift [[COPY]], 0 - ; CHECK: [[UCVTFv2f64_:%[0-9]+]]:fpr128 = nofpexcept UCVTFv2f64 [[USHLLv2i32_shift]] - ; CHECK: $q0 = COPY [[UCVTFv2f64_]] - %0:fpr(<2 x s32>) = COPY $d0 - %1:fpr(<2 x s64>) = G_UITOFP %0 - $q0 = COPY %1(<2 x s64>) -... - ---- -name: sitofp_v2s32_v2s64 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $q0 - - ; CHECK-LABEL: name: sitofp_v2s32_v2s64 - ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0 - ; CHECK: [[SCVTFv2f64_:%[0-9]+]]:fpr128 = nofpexcept SCVTFv2f64 [[COPY]] - ; CHECK: [[FCVTNv2i32_:%[0-9]+]]:fpr64 = nofpexcept FCVTNv2i32 [[SCVTFv2f64_]] - ; CHECK: $d0 = COPY [[FCVTNv2i32_]] - %0:fpr(<2 x s64>) = COPY $q0 - %1:fpr(<2 x s32>) = G_SITOFP %0 - $d0 = COPY %1(<2 x s32>) -... - ---- -name: uitofp_v2s32_v2s64 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $q0 - - ; CHECK-LABEL: name: uitofp_v2s32_v2s64 - ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0 - ; CHECK: [[UCVTFv2f64_:%[0-9]+]]:fpr128 = nofpexcept UCVTFv2f64 [[COPY]] - ; CHECK: [[FCVTNv2i32_:%[0-9]+]]:fpr64 = nofpexcept FCVTNv2i32 [[UCVTFv2f64_]] - ; CHECK: $d0 = COPY [[FCVTNv2i32_]] - %0:fpr(<2 x s64>) = COPY $q0 - %1:fpr(<2 x s32>) = G_UITOFP %0 - $d0 = COPY %1(<2 x s32>) -... diff --git a/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll b/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll index e4b534bfe0e37e..f49d469e50cdd7 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll @@ -997,7 +997,7 @@ define i64 @umull_ldr2_d(ptr %x0, i64 %x1) { ; CHECK-LABEL: umull_ldr2_d: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: and x9, x1, #0xffffffff +; CHECK-NEXT: mov w9, w1 ; CHECK-NEXT: umull x0, w8, w9 ; CHECK-NEXT: ret entry: @@ -1110,7 +1110,7 @@ define i64 @umaddl_ldr2_d(ptr %x0, i64 %x1, i64 %x2) { ; CHECK-LABEL: umaddl_ldr2_d: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: and x9, x1, #0xffffffff +; CHECK-NEXT: mov w9, w1 ; CHECK-NEXT: umaddl x0, w8, w9, x2 ; CHECK-NEXT: ret entry: @@ -1224,7 +1224,7 @@ define i64 @umnegl_ldr2_d(ptr %x0, i64 %x1) { ; CHECK-LABEL: umnegl_ldr2_d: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: and x9, x1, #0xffffffff +; CHECK-NEXT: mov w9, w1 ; CHECK-NEXT: umnegl x0, w8, w9 ; CHECK-NEXT: ret entry: @@ -1338,7 +1338,7 @@ define i64 @umsubl_ldr2_d(ptr %x0, i64 %x1, i64 %x2) { ; CHECK-LABEL: umsubl_ldr2_d: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: and x9, x1, #0xffffffff +; CHECK-NEXT: mov w9, w1 ; CHECK-NEXT: umsubl x0, w8, w9, x2 ; CHECK-NEXT: ret entry: @@ -1400,7 +1400,7 @@ define i64 @umull_and_lshr(i64 %x) { ; CHECK-LABEL: umull_and_lshr: ; CHECK: // %bb.0: ; CHECK-NEXT: lsr x8, x0, #32 -; CHECK-NEXT: and x9, x0, #0xffffffff +; CHECK-NEXT: mov w9, w0 ; CHECK-NEXT: umull x0, w9, w8 ; CHECK-NEXT: ret %lo = and i64 %x, u0xffffffff @@ -1424,7 +1424,7 @@ define i64 @umaddl_and_lshr(i64 %x, i64 %a) { ; CHECK-LABEL: umaddl_and_lshr: ; CHECK: // %bb.0: ; CHECK-NEXT: lsr x8, x0, #32 -; CHECK-NEXT: and x9, x0, #0xffffffff +; CHECK-NEXT: mov w9, w0 ; CHECK-NEXT: umaddl x0, w9, w8, x1 ; CHECK-NEXT: ret %lo = and i64 %x, u0xffffffff @@ -1437,8 +1437,8 @@ define i64 @umaddl_and_lshr(i64 %x, i64 %a) { define i64 @umaddl_and_and(i64 %x, i64 %y, i64 %a) { ; CHECK-LABEL: umaddl_and_and: ; CHECK: // %bb.0: -; CHECK-NEXT: and x8, x0, #0xffffffff -; CHECK-NEXT: and x9, x1, #0xffffffff +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: mov w9, w1 ; CHECK-NEXT: umaddl x0, w8, w9, x2 ; CHECK-NEXT: ret %lo = and i64 %x, u0xffffffff diff --git a/llvm/test/CodeGen/AArch64/and-mask-removal.ll b/llvm/test/CodeGen/AArch64/and-mask-removal.ll index 493d503de2cc13..f005ca47ad124f 100644 --- a/llvm/test/CodeGen/AArch64/and-mask-removal.ll +++ b/llvm/test/CodeGen/AArch64/and-mask-removal.ll @@ -549,3 +549,33 @@ define i64 @test_2_selects(i8 zeroext %a) { } declare i8 @llvm.usub.sat.i8(i8, i8) #0 + +define i64 @and0xffffffff(i64 %a) nounwind ssp { +; CHECK-LABEL: and0xffffffff: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: mov w0, w0 +; CHECK-NEXT: ret +entry: + %b = and i64 %a, u0xffffffff + ret i64 %b +} + +define i64 @and0xfffffff0(i64 %a) nounwind ssp { +; CHECK-LABEL: and0xfffffff0: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: and x0, x0, #0xfffffff0 +; CHECK-NEXT: ret +entry: + %b = and i64 %a, u0xfffffff0 + ret i64 %b +} + +define i64 @and0x7fffffff(i64 %a) nounwind ssp { +; CHECK-LABEL: and0x7fffffff: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: and x0, x0, #0x7fffffff +; CHECK-NEXT: ret +entry: + %b = and i64 %a, u0x7fffffff + ret i64 %b +} diff --git a/llvm/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll b/llvm/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll index 761462be6b4b00..e9a550d07eb58b 100644 --- a/llvm/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll +++ b/llvm/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll @@ -8,7 +8,7 @@ entry: store i64 %ext, ptr %addr, align 8 ; CHECK: adrp x{{[0-9]+}}, _x@GOTPAGE ; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, _x@GOTPAGEOFF] -; CHECK-NEXT: and x{{[0-9]+}}, x{{[0-9]+}}, #0xffffffff +; CHECK-NEXT: mov w{{[0-9]+}}, w{{[0-9]+}} ; CHECK-NEXT: str x{{[0-9]+}}, [x{{[0-9]+}}] ret void } diff --git a/llvm/test/CodeGen/AArch64/arm64_32-memcpy.ll b/llvm/test/CodeGen/AArch64/arm64_32-memcpy.ll index ed71f0958604f1..64c5cfdfec75a3 100644 --- a/llvm/test/CodeGen/AArch64/arm64_32-memcpy.ll +++ b/llvm/test/CodeGen/AArch64/arm64_32-memcpy.ll @@ -2,9 +2,9 @@ define i64 @test_memcpy(ptr %addr, ptr %src, i1 %tst) minsize { ; CHECK-LABEL: test_memcpy: -; CHECK: ldr [[VAL64:x[0-9]+]], [x0] +; CHECK: ldr x[[VAL64:[0-9]+]], [x0] ; [...] -; CHECK: and x0, [[VAL64]], #0xffffffff +; CHECK: mov w0, w[[VAL64]] ; CHECK: bl _memcpy %val64 = load i64, ptr %addr @@ -22,9 +22,9 @@ false: define i64 @test_memmove(ptr %addr, ptr %src, i1 %tst) minsize { ; CHECK-LABEL: test_memmove: -; CHECK: ldr [[VAL64:x[0-9]+]], [x0] +; CHECK: ldr x[[VAL64:[0-9]+]], [x0] ; [...] -; CHECK: and x0, [[VAL64]], #0xffffffff +; CHECK: mov w0, w[[VAL64]] ; CHECK: bl _memmove %val64 = load i64, ptr %addr @@ -42,9 +42,9 @@ false: define i64 @test_memset(ptr %addr, ptr %src, i1 %tst) minsize { ; CHECK-LABEL: test_memset: -; CHECK: ldr [[VAL64:x[0-9]+]], [x0] +; CHECK: ldr x[[VAL64:[0-9]+]], [x0] ; [...] -; CHECK: and x0, [[VAL64]], #0xffffffff +; CHECK: mov w0, w[[VAL64]] ; CHECK: bl _memset %val64 = load i64, ptr %addr diff --git a/llvm/test/CodeGen/AArch64/arm64_32-pointer-extend.ll b/llvm/test/CodeGen/AArch64/arm64_32-pointer-extend.ll index 2d6f0fbe308889..7b004b2f6d3102 100644 --- a/llvm/test/CodeGen/AArch64/arm64_32-pointer-extend.ll +++ b/llvm/test/CodeGen/AArch64/arm64_32-pointer-extend.ll @@ -2,7 +2,7 @@ define void @pass_pointer(i64 %in) { ; CHECK-LABEL: pass_pointer: -; CHECK: and x0, x0, #0xffffffff +; CHECK: mov w0, w0 ; CHECK: bl _take_pointer %in32 = trunc i64 %in to i32 @@ -39,8 +39,8 @@ define void @caller_ptr_stack_slot(ptr %ptr) { define ptr @return_ptr(i64 %in, i64 %r) { ; CHECK-LABEL: return_ptr: -; CHECK: sdiv [[VAL64:x[0-9]+]], x0, x1 -; CHECK: and x0, [[VAL64]], #0xffffffff +; CHECK: sdiv x[[VAL64:[0-9]+]], x0, x1 +; CHECK: mov w0, w[[VAL64]] %sum = sdiv i64 %in, %r %sum32 = trunc i64 %sum to i32 diff --git a/llvm/test/CodeGen/AArch64/arm64_32.ll b/llvm/test/CodeGen/AArch64/arm64_32.ll index 716fdd6eac15c6..c63edf0ceeea37 100644 --- a/llvm/test/CodeGen/AArch64/arm64_32.ll +++ b/llvm/test/CodeGen/AArch64/arm64_32.ll @@ -598,7 +598,7 @@ define void @test_asm_memory(ptr %base.addr) { define void @test_unsafe_asm_memory(i64 %val) { ; CHECK-LABEL: test_unsafe_asm_memory: -; CHECK: and x[[ADDR:[0-9]+]], x0, #0xffffffff +; CHECK: mov w[[ADDR:[0-9]+]], w0 ; CHECK: str wzr, [x[[ADDR]]] %addr_int = trunc i64 %val to i32 %addr = inttoptr i32 %addr_int to ptr @@ -615,7 +615,8 @@ define [9 x ptr] @test_demoted_return(ptr %in) { define ptr @test_inttoptr(i64 %in) { ; CHECK-LABEL: test_inttoptr: -; CHECK: and x0, x0, #0xffffffff +; CHECK-OPT: mov w0, w0 +; CHECK-FAST: and x0, x0, #0xffffffff %res = inttoptr i64 %in to ptr ret ptr %res } @@ -732,7 +733,7 @@ define ptr @test_gep_nonpow2(ptr %a0, i32 %a1) { define void @test_memset(i64 %in, i8 %value) { ; CHECK-LABEL: test_memset: ; CHECK-DAG: lsr x2, x0, #32 -; CHECK-DAG: and x0, x0, #0xffffffff +; CHECK-DAG: mov w0, w0 ; CHECK: b _memset %ptr.i32 = trunc i64 %in to i32 @@ -746,7 +747,7 @@ define void @test_memset(i64 %in, i8 %value) { define void @test_bzero(i64 %in) { ; CHECK-LABEL: test_bzero: ; CHECK-DAG: lsr x1, x0, #32 -; CHECK-DAG: and x0, x0, #0xffffffff +; CHECK-DAG: mov w0, w0 ; CHECK: b _bzero %ptr.i32 = trunc i64 %in to i32 diff --git a/llvm/test/CodeGen/AArch64/bitfield.ll b/llvm/test/CodeGen/AArch64/bitfield.ll index 1dfa4a8e120018..6e18924ea19eec 100644 --- a/llvm/test/CodeGen/AArch64/bitfield.ll +++ b/llvm/test/CodeGen/AArch64/bitfield.ll @@ -173,7 +173,7 @@ define dso_local void @test_zext_inreg_64(i64 %in) { %trunc_i32 = trunc i64 %in to i32 %zext_i32 = zext i32 %trunc_i32 to i64 store volatile i64 %zext_i32, ptr @var64 -; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xffffffff +; CHECK: mov {{w[0-9]+}}, {{w[0-9]+}} ret void } diff --git a/llvm/test/CodeGen/AArch64/cmp-to-cmn.ll b/llvm/test/CodeGen/AArch64/cmp-to-cmn.ll index cc6bd766eed78c..1cc194e77b94b1 100644 --- a/llvm/test/CodeGen/AArch64/cmp-to-cmn.ll +++ b/llvm/test/CodeGen/AArch64/cmp-to-cmn.ll @@ -6,7 +6,7 @@ target triple = "arm64" define i1 @test_EQ_IllEbT(i64 %a, i64 %b) { ; CHECK-LABEL: test_EQ_IllEbT: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmn x1, x0 +; CHECK-NEXT: cmn x0, x1 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret entry: @@ -72,7 +72,7 @@ entry: define i1 @test_EQ_IiiEbT(i32 %a, i32 %b) { ; CHECK-LABEL: test_EQ_IiiEbT: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmn w1, w0 +; CHECK-NEXT: cmn w0, w1 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret entry: @@ -137,8 +137,8 @@ entry: define i1 @test_EQ_IssEbT(i16 %a, i16 %b) { ; CHECK-LABEL: test_EQ_IssEbT: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sxth w8, w1 -; CHECK-NEXT: cmn w8, w0, sxth +; CHECK-NEXT: sxth w8, w0 +; CHECK-NEXT: cmn w8, w1, sxth ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret entry: @@ -152,8 +152,8 @@ entry: define i1 @test_EQ_IscEbT(i16 %a, i8 %b) { ; CHECK-LABEL: test_EQ_IscEbT: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and w8, w1, #0xff -; CHECK-NEXT: cmn w8, w0, sxth +; CHECK-NEXT: sxth w8, w0 +; CHECK-NEXT: cmn w8, w1, uxtb ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret entry: @@ -194,8 +194,8 @@ entry: define i1 @test_EQ_IcsEbT(i8 %a, i16 %b) { ; CHECK-LABEL: test_EQ_IcsEbT: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sxth w8, w1 -; CHECK-NEXT: cmn w8, w0, uxtb +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: cmn w8, w1, sxth ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret entry: @@ -209,8 +209,8 @@ entry: define i1 @test_EQ_IccEbT(i8 %a, i8 %b) { ; CHECK-LABEL: test_EQ_IccEbT: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and w8, w1, #0xff -; CHECK-NEXT: cmn w8, w0, uxtb +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: cmn w8, w1, uxtb ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret entry: @@ -224,7 +224,7 @@ entry: define i1 @test_NE_IllEbT(i64 %a, i64 %b) { ; CHECK-LABEL: test_NE_IllEbT: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmn x1, x0 +; CHECK-NEXT: cmn x0, x1 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret entry: @@ -290,7 +290,7 @@ entry: define i1 @test_NE_IiiEbT(i32 %a, i32 %b) { ; CHECK-LABEL: test_NE_IiiEbT: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmn w1, w0 +; CHECK-NEXT: cmn w0, w1 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret entry: @@ -355,8 +355,8 @@ entry: define i1 @test_NE_IssEbT(i16 %a, i16 %b) { ; CHECK-LABEL: test_NE_IssEbT: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sxth w8, w1 -; CHECK-NEXT: cmn w8, w0, sxth +; CHECK-NEXT: sxth w8, w0 +; CHECK-NEXT: cmn w8, w1, sxth ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret entry: @@ -370,8 +370,8 @@ entry: define i1 @test_NE_IscEbT(i16 %a, i8 %b) { ; CHECK-LABEL: test_NE_IscEbT: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and w8, w1, #0xff -; CHECK-NEXT: cmn w8, w0, sxth +; CHECK-NEXT: sxth w8, w0 +; CHECK-NEXT: cmn w8, w1, uxtb ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret entry: @@ -412,8 +412,8 @@ entry: define i1 @test_NE_IcsEbT(i8 %a, i16 %b) { ; CHECK-LABEL: test_NE_IcsEbT: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sxth w8, w1 -; CHECK-NEXT: cmn w8, w0, uxtb +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: cmn w8, w1, sxth ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret entry: @@ -427,8 +427,8 @@ entry: define i1 @test_NE_IccEbT(i8 %a, i8 %b) { ; CHECK-LABEL: test_NE_IccEbT: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and w8, w1, #0xff -; CHECK-NEXT: cmn w8, w0, uxtb +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: cmn w8, w1, uxtb ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll index bd48c32566fc94..f6eeeef4faf7ed 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector.ll @@ -1,20 +1,57 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s +; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI define <4 x i8> @concat1(<2 x i8> %A, <2 x i8> %B) { -; CHECK-LABEL: concat1: -; CHECK: // %bb.0: -; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: concat1: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: concat1: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] +; CHECK-GI-NEXT: mov s2, v1.s[1] +; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] +; CHECK-GI-NEXT: mov v0.b[3], v2.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret %v4i8 = shufflevector <2 x i8> %A, <2 x i8> %B, <4 x i32> ret <4 x i8> %v4i8 } define <8 x i8> @concat2(<4 x i8> %A, <4 x i8> %B) { -; CHECK-LABEL: concat2: -; CHECK: // %bb.0: -; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: concat2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: concat2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov h2, v1.h[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov h3, v0.h[1] +; CHECK-GI-NEXT: mov h4, v1.h[2] +; CHECK-GI-NEXT: mov h5, v1.h[3] +; CHECK-GI-NEXT: mov h6, v0.h[3] +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: mov h2, v0.h[2] +; CHECK-GI-NEXT: mov v0.h[1], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[2], v4.h[0] +; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NEXT: mov v1.h[3], v5.h[0] +; CHECK-GI-NEXT: mov v0.h[3], v6.h[0] +; CHECK-GI-NEXT: xtn v1.8b, v1.8h +; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret %v8i8 = shufflevector <4 x i8> %A, <4 x i8> %B, <8 x i32> ret <8 x i8> %v8i8 } @@ -31,10 +68,25 @@ define <16 x i8> @concat3(<8 x i8> %A, <8 x i8> %B) { } define <4 x i16> @concat4(<2 x i16> %A, <2 x i16> %B) { -; CHECK-LABEL: concat4: -; CHECK: // %bb.0: -; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: concat4: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: concat4: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov s2, v1.s[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov s3, v0.s[1] +; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v0.s[1], v3.s[0] +; CHECK-GI-NEXT: xtn v1.4h, v1.4s +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret %v4i16 = shufflevector <2 x i16> %A, <2 x i16> %B, <4 x i32> ret <4 x i16> %v4i16 } @@ -86,10 +138,18 @@ define <8 x i32> @concat8(ptr %A, ptr %B) { } define <4 x half> @concat9(<2 x half> %A, <2 x half> %B) { -; CHECK-LABEL: concat9: -; CHECK: // %bb.0: -; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: concat9: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: zip1 v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: concat9: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret %v4half= shufflevector <2 x half> %A, <2 x half> %B, <4 x i32> ret <4 x half> %v4half } @@ -112,3 +172,179 @@ define <16 x half> @concat11(<8 x half> %A, <8 x half> %B) { %v16half= shufflevector <8 x half> %A, <8 x half> %B, <16 x i32> ret <16 x half> %v16half } + +define <8 x i16> @concat_v8s16_v2s16(ptr %ptr) { +; CHECK-SD-LABEL: concat_v8s16_v2s16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: concat_v8s16_v2s16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: dup v0.4s, w8 +; CHECK-GI-NEXT: ldr h1, [x0] +; CHECK-GI-NEXT: ldr h2, [x0, #2] +; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] +; CHECK-GI-NEXT: xtn v2.4h, v0.4s +; CHECK-GI-NEXT: xtn v0.4h, v1.4s +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov v0.s[3], w8 +; CHECK-GI-NEXT: ret + %a = load <2 x i16>, ptr %ptr + %b = shufflevector <2 x i16> %a, <2 x i16> %a, <8 x i32> + ret <8 x i16> %b +} + +define <16 x i8> @concat_v16s8_v4s8(ptr %ptr) { +; CHECK-SD-LABEL: concat_v16s8_v4s8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: concat_v16s8_v4s8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: dup v0.8h, w8 +; CHECK-GI-NEXT: xtn v1.8b, v0.8h +; CHECK-GI-NEXT: ldr s0, [x0] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[3], v1.s[0] +; CHECK-GI-NEXT: ret + %a = load <4 x i8>, ptr %ptr + %b = shufflevector <4 x i8> %a, <4 x i8> %a, <16 x i32> + ret <16 x i8> %b +} + +define <16 x i8> @concat_v16s8_v4s8_load(ptr %ptrA, ptr %ptrB, ptr %ptrC, ptr %ptrD) { +; CHECK-SD-LABEL: concat_v16s8_v4s8_load: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ld1 { v0.s }[1], [x1] +; CHECK-SD-NEXT: ld1 { v0.s }[2], [x2] +; CHECK-SD-NEXT: ld1 { v0.s }[3], [x3] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: concat_v16s8_v4s8_load: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr s0, [x0] +; CHECK-GI-NEXT: ldr s1, [x1] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: ldr s1, [x2] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: ldr s1, [x3] +; CHECK-GI-NEXT: mov v0.s[3], v1.s[0] +; CHECK-GI-NEXT: ret + %A = load <4 x i8>, ptr %ptrA + %B = load <4 x i8>, ptr %ptrB + %C = load <4 x i8>, ptr %ptrC + %D = load <4 x i8>, ptr %ptrD + %b = shufflevector <4 x i8> %A, <4 x i8> %B, <16 x i32> + %c = shufflevector <4 x i8> %C, <4 x i8> %D, <16 x i32> + %d = shufflevector <16 x i8> %b, <16 x i8> %c, <16 x i32> + ret <16 x i8> %d +} + + +define <16 x i8> @concat_v16s8_v4s8_reg(<4 x i8> %A, <4 x i8> %B, <4 x i8> %C, <4 x i8> %D) { +; CHECK-SD-LABEL: concat_v16s8_v4s8_reg: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: mov v2.d[1], v3.d[0] +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: concat_v16s8_v4s8_reg: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov h4, v1.h[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov h5, v0.h[1] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-GI-NEXT: mov h6, v1.h[2] +; CHECK-GI-NEXT: mov h7, v1.h[3] +; CHECK-GI-NEXT: mov h16, v2.h[1] +; CHECK-GI-NEXT: mov h17, v0.h[3] +; CHECK-GI-NEXT: mov h18, v2.h[3] +; CHECK-GI-NEXT: mov v1.h[1], v4.h[0] +; CHECK-GI-NEXT: mov h4, v0.h[2] +; CHECK-GI-NEXT: mov v0.h[1], v5.h[0] +; CHECK-GI-NEXT: mov h5, v2.h[2] +; CHECK-GI-NEXT: mov v2.h[1], v16.h[0] +; CHECK-GI-NEXT: mov v1.h[2], v6.h[0] +; CHECK-GI-NEXT: mov h6, v3.h[1] +; CHECK-GI-NEXT: mov v0.h[2], v4.h[0] +; CHECK-GI-NEXT: mov v2.h[2], v5.h[0] +; CHECK-GI-NEXT: mov h4, v3.h[2] +; CHECK-GI-NEXT: mov h5, v3.h[3] +; CHECK-GI-NEXT: mov v1.h[3], v7.h[0] +; CHECK-GI-NEXT: mov v3.h[1], v6.h[0] +; CHECK-GI-NEXT: mov v0.h[3], v17.h[0] +; CHECK-GI-NEXT: mov v2.h[3], v18.h[0] +; CHECK-GI-NEXT: xtn v1.8b, v1.8h +; CHECK-GI-NEXT: mov v3.h[2], v4.h[0] +; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: xtn v2.8b, v2.8h +; CHECK-GI-NEXT: mov v3.h[3], v5.h[0] +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: xtn v1.8b, v3.8h +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.s[3], w8 +; CHECK-GI-NEXT: ret + %b = shufflevector <4 x i8> %A, <4 x i8> %B, <16 x i32> + %c = shufflevector <4 x i8> %C, <4 x i8> %D, <16 x i32> + %d = shufflevector <16 x i8> %b, <16 x i8> %c, <16 x i32> + ret <16 x i8> %d +} + +define <8 x i16> @concat_v8s16_v2s16_reg(<2 x i16> %A, <2 x i16> %B, <2 x i16> %C, <2 x i16> %D) { +; CHECK-SD-LABEL: concat_v8s16_v2s16_reg: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: adrp x8, .LCPI15_0 +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI15_0] +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: concat_v8s16_v2s16_reg: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov s4, v1.s[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov s5, v0.s[1] +; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-GI-NEXT: mov v1.s[1], v4.s[0] +; CHECK-GI-NEXT: mov s4, v2.s[1] +; CHECK-GI-NEXT: mov v0.s[1], v5.s[0] +; CHECK-GI-NEXT: xtn v1.4h, v1.4s +; CHECK-GI-NEXT: mov v2.s[1], v4.s[0] +; CHECK-GI-NEXT: mov s4, v3.s[1] +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: xtn v2.4h, v2.4s +; CHECK-GI-NEXT: mov v3.s[1], v4.s[0] +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: xtn v1.4h, v3.4s +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.s[3], w8 +; CHECK-GI-NEXT: ret + %b = shufflevector <2 x i16> %A, <2 x i16> %B, <8 x i32> + %c = shufflevector <2 x i16> %C, <2 x i16> %D, <8 x i32> + %d = shufflevector <8 x i16> %b, <8 x i16> %c, <8 x i32> + ret <8 x i16> %d +} diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll index 2e992964f59862..2ea7e0f3c44a9a 100644 --- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll +++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll @@ -1,12 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI +; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) { -; CHECK-LABEL: interleave2_v4f16: -; CHECK: // %bb.0: -; CHECK-NEXT: zip1 v0.4h, v0.4h, v1.4h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: interleave2_v4f16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: interleave2_v4f16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: dup v2.4s, w8 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: xtn v2.4h, v2.4s +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: zip1 v0.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: ret %retval = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1) ret <4 x half> %retval } diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll index ac26ccc44128ff..7a4c5cee27b805 100644 --- a/llvm/test/CodeGen/AArch64/itofp.ll +++ b/llvm/test/CodeGen/AArch64/itofp.ll @@ -1,225 +1,228 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16 ; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16 -; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16 -; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16 - -; CHECK-GI: warning: Instruction selection used fallback path for stofp_i128_f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_i128_f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_i64_f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_i64_f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_i32_f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_i32_f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_i16_f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_i16_f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_i8_f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_i8_f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_i128_f64 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_i128_f64 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_i128_f32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_i128_f32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_i128_f16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_i128_f16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v2i128_v2f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v2i128_v2f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v3i128_v3f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v3i128_v3f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v2i64_v2f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v2i64_v2f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v3i64_v3f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v3i64_v3f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v2i128_v2f64 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v2i128_v2f64 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v3i128_v3f64 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v3i128_v3f64 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v2i32_v2f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v2i32_v2f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v3i32_v3f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v3i32_v3f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v2i16_v2f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v2i16_v2f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v3i16_v3f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v3i16_v3f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v2i8_v2f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v2i8_v2f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v3i8_v3f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v3i8_v3f128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v2i128_v2f32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v2i128_v2f32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v3i128_v3f32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v3i128_v3f32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v2i128_v2f16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v2i128_v2f16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for stofp_v3i128_v3f16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v3i128_v3f16 +; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16 +; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16 define fp128 @stofp_i128_f128(i128 %a) { -; CHECK-LABEL: stofp_i128_f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl __floattitf -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_i128_f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: bl __floattitf +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_i128_f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: b __floattitf entry: %c = sitofp i128 %a to fp128 ret fp128 %c } define fp128 @utofp_i128_f128(i128 %a) { -; CHECK-LABEL: utofp_i128_f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl __floatuntitf -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_i128_f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: bl __floatuntitf +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_i128_f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: b __floatuntitf entry: %c = uitofp i128 %a to fp128 ret fp128 %c } define fp128 @stofp_i64_f128(i64 %a) { -; CHECK-LABEL: stofp_i64_f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl __floatditf -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_i64_f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: bl __floatditf +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_i64_f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: b __floatditf entry: %c = sitofp i64 %a to fp128 ret fp128 %c } define fp128 @utofp_i64_f128(i64 %a) { -; CHECK-LABEL: utofp_i64_f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl __floatunditf -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_i64_f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: bl __floatunditf +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_i64_f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: b __floatunditf entry: %c = uitofp i64 %a to fp128 ret fp128 %c } define fp128 @stofp_i32_f128(i32 %a) { -; CHECK-LABEL: stofp_i32_f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl __floatsitf -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_i32_f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: bl __floatsitf +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_i32_f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: b __floatsitf entry: %c = sitofp i32 %a to fp128 ret fp128 %c } define fp128 @utofp_i32_f128(i32 %a) { -; CHECK-LABEL: utofp_i32_f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl __floatunsitf -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_i32_f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: bl __floatunsitf +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_i32_f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: b __floatunsitf entry: %c = uitofp i32 %a to fp128 ret fp128 %c } define fp128 @stofp_i16_f128(i16 %a) { -; CHECK-LABEL: stofp_i16_f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: sxth w0, w0 -; CHECK-NEXT: bl __floatsitf -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_i16_f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: sxth w0, w0 +; CHECK-SD-NEXT: bl __floatsitf +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_i16_f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sxth w0, w0 +; CHECK-GI-NEXT: b __floatsitf entry: %c = sitofp i16 %a to fp128 ret fp128 %c } define fp128 @utofp_i16_f128(i16 %a) { -; CHECK-LABEL: utofp_i16_f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: and w0, w0, #0xffff -; CHECK-NEXT: bl __floatunsitf -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_i16_f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: and w0, w0, #0xffff +; CHECK-SD-NEXT: bl __floatunsitf +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_i16_f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: and w0, w0, #0xffff +; CHECK-GI-NEXT: b __floatunsitf entry: %c = uitofp i16 %a to fp128 ret fp128 %c } define fp128 @stofp_i8_f128(i8 %a) { -; CHECK-LABEL: stofp_i8_f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: sxtb w0, w0 -; CHECK-NEXT: bl __floatsitf -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_i8_f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: sxtb w0, w0 +; CHECK-SD-NEXT: bl __floatsitf +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_i8_f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sxtb w0, w0 +; CHECK-GI-NEXT: b __floatsitf entry: %c = sitofp i8 %a to fp128 ret fp128 %c } define fp128 @utofp_i8_f128(i8 %a) { -; CHECK-LABEL: utofp_i8_f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: and w0, w0, #0xff -; CHECK-NEXT: bl __floatunsitf -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_i8_f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: and w0, w0, #0xff +; CHECK-SD-NEXT: bl __floatunsitf +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_i8_f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: and w0, w0, #0xff +; CHECK-GI-NEXT: b __floatunsitf entry: %c = uitofp i8 %a to fp128 ret fp128 %c } define double @stofp_i128_f64(i128 %a) { -; CHECK-LABEL: stofp_i128_f64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl __floattidf -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_i128_f64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: bl __floattidf +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_i128_f64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: b __floattidf entry: %c = sitofp i128 %a to double ret double %c } define double @utofp_i128_f64(i128 %a) { -; CHECK-LABEL: utofp_i128_f64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl __floatuntidf -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_i128_f64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: bl __floatuntidf +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_i128_f64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: b __floatuntidf entry: %c = uitofp i128 %a to double ret double %c @@ -310,28 +313,36 @@ entry: } define float @stofp_i128_f32(i128 %a) { -; CHECK-LABEL: stofp_i128_f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl __floattisf -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_i128_f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: bl __floattisf +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_i128_f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: b __floattisf entry: %c = sitofp i128 %a to float ret float %c } define float @utofp_i128_f32(i128 %a) { -; CHECK-LABEL: utofp_i128_f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl __floatuntisf -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_i128_f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: bl __floatuntisf +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_i128_f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: b __floatuntisf entry: %c = uitofp i128 %a to float ret float %c @@ -453,12 +464,7 @@ define half @stofp_i128_f16(i128 %a) { ; ; CHECK-GI-FP16-LABEL: stofp_i128_f16: ; CHECK-GI-FP16: // %bb.0: // %entry -; CHECK-GI-FP16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-FP16-NEXT: .cfi_offset w30, -16 -; CHECK-GI-FP16-NEXT: bl __floattihf -; CHECK-GI-FP16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-FP16-NEXT: ret +; CHECK-GI-FP16-NEXT: b __floattihf entry: %c = sitofp i128 %a to half ret half %c @@ -496,12 +502,7 @@ define half @utofp_i128_f16(i128 %a) { ; ; CHECK-GI-FP16-LABEL: utofp_i128_f16: ; CHECK-GI-FP16: // %bb.0: // %entry -; CHECK-GI-FP16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-FP16-NEXT: .cfi_offset w30, -16 -; CHECK-GI-FP16-NEXT: bl __floatuntihf -; CHECK-GI-FP16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-FP16-NEXT: ret +; CHECK-GI-FP16-NEXT: b __floatuntihf entry: %c = uitofp i128 %a to half ret half %c @@ -740,390 +741,720 @@ entry: } define <2 x fp128> @stofp_v2i128_v2f128(<2 x i128> %a) { -; CHECK-LABEL: stofp_v2i128_v2f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w30, -32 -; CHECK-NEXT: mov x19, x3 -; CHECK-NEXT: mov x20, x2 -; CHECK-NEXT: bl __floattitf -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: mov x1, x19 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floattitf -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #48 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_v2i128_v2f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #48 +; CHECK-SD-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w30, -32 +; CHECK-SD-NEXT: mov x19, x3 +; CHECK-SD-NEXT: mov x20, x2 +; CHECK-SD-NEXT: bl __floattitf +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x19 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floattitf +; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #48 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_v2i128_v2f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #48 +; CHECK-GI-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: bl __floattitf +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floattitf +; CHECK-GI-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #48 +; CHECK-GI-NEXT: ret entry: %c = sitofp <2 x i128> %a to <2 x fp128> ret <2 x fp128> %c } define <2 x fp128> @utofp_v2i128_v2f128(<2 x i128> %a) { -; CHECK-LABEL: utofp_v2i128_v2f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w30, -32 -; CHECK-NEXT: mov x19, x3 -; CHECK-NEXT: mov x20, x2 -; CHECK-NEXT: bl __floatuntitf -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: mov x1, x19 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floatuntitf -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #48 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_v2i128_v2f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #48 +; CHECK-SD-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w30, -32 +; CHECK-SD-NEXT: mov x19, x3 +; CHECK-SD-NEXT: mov x20, x2 +; CHECK-SD-NEXT: bl __floatuntitf +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x19 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floatuntitf +; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #48 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_v2i128_v2f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #48 +; CHECK-GI-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: bl __floatuntitf +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatuntitf +; CHECK-GI-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #48 +; CHECK-GI-NEXT: ret entry: %c = uitofp <2 x i128> %a to <2 x fp128> ret <2 x fp128> %c } define <3 x fp128> @stofp_v3i128_v3f128(<3 x i128> %a) { -; CHECK-LABEL: stofp_v3i128_v3f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w30, -48 -; CHECK-NEXT: mov x19, x5 -; CHECK-NEXT: mov x20, x4 -; CHECK-NEXT: mov x21, x3 -; CHECK-NEXT: mov x22, x2 -; CHECK-NEXT: bl __floattitf -; CHECK-NEXT: mov x0, x22 -; CHECK-NEXT: mov x1, x21 -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: bl __floattitf -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: mov x1, x19 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floattitf -; CHECK-NEXT: mov v2.16b, v0.16b -; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload -; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #80 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_v3i128_v3f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #80 +; CHECK-SD-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 80 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w30, -48 +; CHECK-SD-NEXT: mov x19, x5 +; CHECK-SD-NEXT: mov x20, x4 +; CHECK-SD-NEXT: mov x21, x3 +; CHECK-SD-NEXT: mov x22, x2 +; CHECK-SD-NEXT: bl __floattitf +; CHECK-SD-NEXT: mov x0, x22 +; CHECK-SD-NEXT: mov x1, x21 +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floattitf +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x19 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floattitf +; CHECK-SD-NEXT: mov v2.16b, v0.16b +; CHECK-SD-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-SD-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #80 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_v3i128_v3f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #80 +; CHECK-GI-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 80 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w30, -48 +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: mov x21, x4 +; CHECK-GI-NEXT: mov x22, x5 +; CHECK-GI-NEXT: bl __floattitf +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floattitf +; CHECK-GI-NEXT: mov x0, x21 +; CHECK-GI-NEXT: mov x1, x22 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floattitf +; CHECK-GI-NEXT: mov v2.16b, v0.16b +; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #80 +; CHECK-GI-NEXT: ret entry: %c = sitofp <3 x i128> %a to <3 x fp128> ret <3 x fp128> %c } define <3 x fp128> @utofp_v3i128_v3f128(<3 x i128> %a) { -; CHECK-LABEL: utofp_v3i128_v3f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w30, -48 -; CHECK-NEXT: mov x19, x5 -; CHECK-NEXT: mov x20, x4 -; CHECK-NEXT: mov x21, x3 -; CHECK-NEXT: mov x22, x2 -; CHECK-NEXT: bl __floatuntitf -; CHECK-NEXT: mov x0, x22 -; CHECK-NEXT: mov x1, x21 -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: bl __floatuntitf -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: mov x1, x19 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floatuntitf -; CHECK-NEXT: mov v2.16b, v0.16b -; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload -; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #80 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_v3i128_v3f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #80 +; CHECK-SD-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 80 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w30, -48 +; CHECK-SD-NEXT: mov x19, x5 +; CHECK-SD-NEXT: mov x20, x4 +; CHECK-SD-NEXT: mov x21, x3 +; CHECK-SD-NEXT: mov x22, x2 +; CHECK-SD-NEXT: bl __floatuntitf +; CHECK-SD-NEXT: mov x0, x22 +; CHECK-SD-NEXT: mov x1, x21 +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floatuntitf +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x19 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floatuntitf +; CHECK-SD-NEXT: mov v2.16b, v0.16b +; CHECK-SD-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-SD-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #80 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_v3i128_v3f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #80 +; CHECK-GI-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 80 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w30, -48 +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: mov x21, x4 +; CHECK-GI-NEXT: mov x22, x5 +; CHECK-GI-NEXT: bl __floatuntitf +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatuntitf +; CHECK-GI-NEXT: mov x0, x21 +; CHECK-GI-NEXT: mov x1, x22 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatuntitf +; CHECK-GI-NEXT: mov v2.16b, v0.16b +; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #80 +; CHECK-GI-NEXT: ret entry: %c = uitofp <3 x i128> %a to <3 x fp128> ret <3 x fp128> %c } define <2 x fp128> @stofp_v2i64_v2f128(<2 x i64> %a) { -; CHECK-LABEL: stofp_v2i64_v2f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floatditf -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: mov x0, v0.d[1] -; CHECK-NEXT: bl __floatditf -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #48 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_v2i64_v2f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #48 +; CHECK-SD-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floatditf +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov x0, v0.d[1] +; CHECK-SD-NEXT: bl __floatditf +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #48 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_v2i64_v2f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #32 +; CHECK-GI-NEXT: str d8, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #24] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset b8, -16 +; CHECK-GI-NEXT: fmov x0, d0 +; CHECK-GI-NEXT: mov d8, v0.d[1] +; CHECK-GI-NEXT: bl __floatditf +; CHECK-GI-NEXT: fmov x0, d8 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatditf +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldr d8, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #32 +; CHECK-GI-NEXT: ret entry: %c = sitofp <2 x i64> %a to <2 x fp128> ret <2 x fp128> %c } define <2 x fp128> @utofp_v2i64_v2f128(<2 x i64> %a) { -; CHECK-LABEL: utofp_v2i64_v2f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floatunditf -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: mov x0, v0.d[1] -; CHECK-NEXT: bl __floatunditf -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #48 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_v2i64_v2f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #48 +; CHECK-SD-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floatunditf +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov x0, v0.d[1] +; CHECK-SD-NEXT: bl __floatunditf +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #48 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_v2i64_v2f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #32 +; CHECK-GI-NEXT: str d8, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #24] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset b8, -16 +; CHECK-GI-NEXT: fmov x0, d0 +; CHECK-GI-NEXT: mov d8, v0.d[1] +; CHECK-GI-NEXT: bl __floatunditf +; CHECK-GI-NEXT: fmov x0, d8 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatunditf +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldr d8, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #32 +; CHECK-GI-NEXT: ret entry: %c = uitofp <2 x i64> %a to <2 x fp128> ret <2 x fp128> %c } define <3 x fp128> @stofp_v3i64_v3f128(<3 x i64> %a) { -; CHECK-LABEL: stofp_v3i64_v3f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: str q2, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floatditf -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: bl __floatditf -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: bl __floatditf -; CHECK-NEXT: mov v2.16b, v0.16b -; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #64 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_v3i64_v3f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #64 +; CHECK-SD-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: str q2, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floatditf +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: bl __floatditf +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: bl __floatditf +; CHECK-SD-NEXT: mov v2.16b, v0.16b +; CHECK-SD-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #64 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_v3i64_v3f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #64 +; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 +; CHECK-GI-NEXT: fmov x0, d0 +; CHECK-GI-NEXT: fmov d8, d1 +; CHECK-GI-NEXT: fmov d9, d2 +; CHECK-GI-NEXT: bl __floatditf +; CHECK-GI-NEXT: fmov x0, d8 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatditf +; CHECK-GI-NEXT: fmov x0, d9 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatditf +; CHECK-GI-NEXT: mov v2.16b, v0.16b +; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #64 +; CHECK-GI-NEXT: ret entry: %c = sitofp <3 x i64> %a to <3 x fp128> ret <3 x fp128> %c } -define <3 x fp128> @utofp_v3i64_v3f128(<3 x i64> %a) { -; CHECK-LABEL: utofp_v3i64_v3f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: str q2, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floatunditf -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: bl __floatunditf -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: bl __floatunditf -; CHECK-NEXT: mov v2.16b, v0.16b -; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #64 -; CHECK-NEXT: ret +define <3 x fp128> @utofp_v3i64_v2f128(<3 x i64> %a) { +; CHECK-SD-LABEL: utofp_v3i64_v2f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #64 +; CHECK-SD-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: str q2, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floatunditf +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: bl __floatunditf +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: bl __floatunditf +; CHECK-SD-NEXT: mov v2.16b, v0.16b +; CHECK-SD-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #64 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_v3i64_v2f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #64 +; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 +; CHECK-GI-NEXT: fmov x0, d0 +; CHECK-GI-NEXT: fmov d8, d1 +; CHECK-GI-NEXT: fmov d9, d2 +; CHECK-GI-NEXT: bl __floatunditf +; CHECK-GI-NEXT: fmov x0, d8 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatunditf +; CHECK-GI-NEXT: fmov x0, d9 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatunditf +; CHECK-GI-NEXT: mov v2.16b, v0.16b +; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #64 +; CHECK-GI-NEXT: ret entry: %c = uitofp <3 x i64> %a to <3 x fp128> ret <3 x fp128> %c } define <2 x double> @stofp_v2i128_v2f64(<2 x i128> %a) { -; CHECK-LABEL: stofp_v2i128_v2f64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w30, -32 -; CHECK-NEXT: mov x19, x1 -; CHECK-NEXT: mov x20, x0 -; CHECK-NEXT: mov x0, x2 -; CHECK-NEXT: mov x1, x3 -; CHECK-NEXT: bl __floattidf -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: mov x1, x19 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floattidf -; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: mov v0.d[1], v1.d[0] -; CHECK-NEXT: add sp, sp, #48 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_v2i128_v2f64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #48 +; CHECK-SD-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w30, -32 +; CHECK-SD-NEXT: mov x19, x1 +; CHECK-SD-NEXT: mov x20, x0 +; CHECK-SD-NEXT: mov x0, x2 +; CHECK-SD-NEXT: mov x1, x3 +; CHECK-SD-NEXT: bl __floattidf +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x19 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floattidf +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: add sp, sp, #48 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_v2i128_v2f64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #48 +; CHECK-GI-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: bl __floattidf +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floattidf +; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: add sp, sp, #48 +; CHECK-GI-NEXT: ret entry: %c = sitofp <2 x i128> %a to <2 x double> ret <2 x double> %c } define <2 x double> @utofp_v2i128_v2f64(<2 x i128> %a) { -; CHECK-LABEL: utofp_v2i128_v2f64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w30, -32 -; CHECK-NEXT: mov x19, x1 -; CHECK-NEXT: mov x20, x0 -; CHECK-NEXT: mov x0, x2 -; CHECK-NEXT: mov x1, x3 -; CHECK-NEXT: bl __floatuntidf -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: mov x1, x19 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floatuntidf -; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: mov v0.d[1], v1.d[0] -; CHECK-NEXT: add sp, sp, #48 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_v2i128_v2f64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #48 +; CHECK-SD-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w30, -32 +; CHECK-SD-NEXT: mov x19, x1 +; CHECK-SD-NEXT: mov x20, x0 +; CHECK-SD-NEXT: mov x0, x2 +; CHECK-SD-NEXT: mov x1, x3 +; CHECK-SD-NEXT: bl __floatuntidf +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x19 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floatuntidf +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: add sp, sp, #48 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_v2i128_v2f64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #48 +; CHECK-GI-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: bl __floatuntidf +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatuntidf +; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: add sp, sp, #48 +; CHECK-GI-NEXT: ret entry: %c = uitofp <2 x i128> %a to <2 x double> ret <2 x double> %c } define <3 x double> @stofp_v3i128_v3f64(<3 x i128> %a) { -; CHECK-LABEL: stofp_v3i128_v3f64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp d9, d8, [sp, #-64]! // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w30, -48 -; CHECK-NEXT: .cfi_offset b8, -56 -; CHECK-NEXT: .cfi_offset b9, -64 -; CHECK-NEXT: mov x19, x5 -; CHECK-NEXT: mov x20, x4 -; CHECK-NEXT: mov x21, x3 -; CHECK-NEXT: mov x22, x2 -; CHECK-NEXT: bl __floattidf -; CHECK-NEXT: mov x0, x22 -; CHECK-NEXT: mov x1, x21 -; CHECK-NEXT: fmov d8, d0 -; CHECK-NEXT: bl __floattidf -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: mov x1, x19 -; CHECK-NEXT: fmov d9, d0 -; CHECK-NEXT: bl __floattidf -; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: fmov d2, d0 -; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: fmov d0, d8 -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: fmov d1, d9 -; CHECK-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_v3i128_v3f64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: stp d9, d8, [sp, #-64]! // 16-byte Folded Spill +; CHECK-SD-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w30, -48 +; CHECK-SD-NEXT: .cfi_offset b8, -56 +; CHECK-SD-NEXT: .cfi_offset b9, -64 +; CHECK-SD-NEXT: mov x19, x5 +; CHECK-SD-NEXT: mov x20, x4 +; CHECK-SD-NEXT: mov x21, x3 +; CHECK-SD-NEXT: mov x22, x2 +; CHECK-SD-NEXT: bl __floattidf +; CHECK-SD-NEXT: mov x0, x22 +; CHECK-SD-NEXT: mov x1, x21 +; CHECK-SD-NEXT: fmov d8, d0 +; CHECK-SD-NEXT: bl __floattidf +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x19 +; CHECK-SD-NEXT: fmov d9, d0 +; CHECK-SD-NEXT: bl __floattidf +; CHECK-SD-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: fmov d2, d0 +; CHECK-SD-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: fmov d0, d8 +; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-SD-NEXT: fmov d1, d9 +; CHECK-SD-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_v3i128_v3f64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp d9, d8, [sp, #-64]! // 16-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w30, -48 +; CHECK-GI-NEXT: .cfi_offset b8, -56 +; CHECK-GI-NEXT: .cfi_offset b9, -64 +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: mov x21, x4 +; CHECK-GI-NEXT: mov x22, x5 +; CHECK-GI-NEXT: bl __floattidf +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: fmov d8, d0 +; CHECK-GI-NEXT: bl __floattidf +; CHECK-GI-NEXT: mov x0, x21 +; CHECK-GI-NEXT: mov x1, x22 +; CHECK-GI-NEXT: fmov d9, d0 +; CHECK-GI-NEXT: bl __floattidf +; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: fmov d2, d0 +; CHECK-GI-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: fmov d0, d8 +; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-NEXT: fmov d1, d9 +; CHECK-GI-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret entry: %c = sitofp <3 x i128> %a to <3 x double> ret <3 x double> %c } define <3 x double> @utofp_v3i128_v3f64(<3 x i128> %a) { -; CHECK-LABEL: utofp_v3i128_v3f64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp d9, d8, [sp, #-64]! // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w30, -48 -; CHECK-NEXT: .cfi_offset b8, -56 -; CHECK-NEXT: .cfi_offset b9, -64 -; CHECK-NEXT: mov x19, x5 -; CHECK-NEXT: mov x20, x4 -; CHECK-NEXT: mov x21, x3 -; CHECK-NEXT: mov x22, x2 -; CHECK-NEXT: bl __floatuntidf -; CHECK-NEXT: mov x0, x22 -; CHECK-NEXT: mov x1, x21 -; CHECK-NEXT: fmov d8, d0 -; CHECK-NEXT: bl __floatuntidf -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: mov x1, x19 -; CHECK-NEXT: fmov d9, d0 -; CHECK-NEXT: bl __floatuntidf -; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: fmov d2, d0 -; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: fmov d0, d8 -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: fmov d1, d9 -; CHECK-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_v3i128_v3f64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: stp d9, d8, [sp, #-64]! // 16-byte Folded Spill +; CHECK-SD-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w30, -48 +; CHECK-SD-NEXT: .cfi_offset b8, -56 +; CHECK-SD-NEXT: .cfi_offset b9, -64 +; CHECK-SD-NEXT: mov x19, x5 +; CHECK-SD-NEXT: mov x20, x4 +; CHECK-SD-NEXT: mov x21, x3 +; CHECK-SD-NEXT: mov x22, x2 +; CHECK-SD-NEXT: bl __floatuntidf +; CHECK-SD-NEXT: mov x0, x22 +; CHECK-SD-NEXT: mov x1, x21 +; CHECK-SD-NEXT: fmov d8, d0 +; CHECK-SD-NEXT: bl __floatuntidf +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x19 +; CHECK-SD-NEXT: fmov d9, d0 +; CHECK-SD-NEXT: bl __floatuntidf +; CHECK-SD-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: fmov d2, d0 +; CHECK-SD-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: fmov d0, d8 +; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-SD-NEXT: fmov d1, d9 +; CHECK-SD-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_v3i128_v3f64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp d9, d8, [sp, #-64]! // 16-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w30, -48 +; CHECK-GI-NEXT: .cfi_offset b8, -56 +; CHECK-GI-NEXT: .cfi_offset b9, -64 +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: mov x21, x4 +; CHECK-GI-NEXT: mov x22, x5 +; CHECK-GI-NEXT: bl __floatuntidf +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: fmov d8, d0 +; CHECK-GI-NEXT: bl __floatuntidf +; CHECK-GI-NEXT: mov x0, x21 +; CHECK-GI-NEXT: mov x1, x22 +; CHECK-GI-NEXT: fmov d9, d0 +; CHECK-GI-NEXT: bl __floatuntidf +; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: fmov d2, d0 +; CHECK-GI-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: fmov d0, d8 +; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-NEXT: fmov d1, d9 +; CHECK-GI-NEXT: ldp d9, d8, [sp], #64 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret entry: %c = uitofp <3 x i128> %a to <3 x double> ret <3 x double> %c @@ -1438,108 +1769,204 @@ entry: } define <2 x fp128> @stofp_v2i32_v2f128(<2 x i32> %a) { -; CHECK-LABEL: stofp_v2i32_v2f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floatsitf -; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: mov w0, v1.s[1] -; CHECK-NEXT: bl __floatsitf -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_v2i32_v2f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #32 +; CHECK-SD-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floatsitf +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov w0, v1.s[1] +; CHECK-SD-NEXT: bl __floatsitf +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_v2i32_v2f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #32 +; CHECK-GI-NEXT: str d8, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #24] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset b8, -16 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: mov s8, v0.s[1] +; CHECK-GI-NEXT: bl __floatsitf +; CHECK-GI-NEXT: fmov w0, s8 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatsitf +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldr d8, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #32 +; CHECK-GI-NEXT: ret entry: %c = sitofp <2 x i32> %a to <2 x fp128> ret <2 x fp128> %c } define <2 x fp128> @utofp_v2i32_v2f128(<2 x i32> %a) { -; CHECK-LABEL: utofp_v2i32_v2f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floatunsitf -; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: mov w0, v1.s[1] -; CHECK-NEXT: bl __floatunsitf -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_v2i32_v2f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #32 +; CHECK-SD-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floatunsitf +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov w0, v1.s[1] +; CHECK-SD-NEXT: bl __floatunsitf +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_v2i32_v2f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #32 +; CHECK-GI-NEXT: str d8, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #24] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset b8, -16 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: mov s8, v0.s[1] +; CHECK-GI-NEXT: bl __floatunsitf +; CHECK-GI-NEXT: fmov w0, s8 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatunsitf +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldr d8, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #32 +; CHECK-GI-NEXT: ret entry: %c = uitofp <2 x i32> %a to <2 x fp128> ret <2 x fp128> %c } define <3 x fp128> @stofp_v3i32_v3f128(<3 x i32> %a) { -; CHECK-LABEL: stofp_v3i32_v3f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: fmov w0, s1 -; CHECK-NEXT: bl __floatsitf -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: bl __floatsitf -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: mov w0, v0.s[1] -; CHECK-NEXT: bl __floatsitf -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: ldp q0, q2, [sp] // 32-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #64 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_v3i32_v3f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #64 +; CHECK-SD-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: fmov w0, s1 +; CHECK-SD-NEXT: bl __floatsitf +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: bl __floatsitf +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, v0.s[1] +; CHECK-SD-NEXT: bl __floatsitf +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ldp q0, q2, [sp] // 32-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #64 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_v3i32_v3f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #64 +; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: mov s8, v0.s[1] +; CHECK-GI-NEXT: mov s9, v0.s[2] +; CHECK-GI-NEXT: bl __floatsitf +; CHECK-GI-NEXT: fmov w0, s8 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatsitf +; CHECK-GI-NEXT: fmov w0, s9 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatsitf +; CHECK-GI-NEXT: mov v2.16b, v0.16b +; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #64 +; CHECK-GI-NEXT: ret entry: %c = sitofp <3 x i32> %a to <3 x fp128> ret <3 x fp128> %c } define <3 x fp128> @utofp_v3i32_v3f128(<3 x i32> %a) { -; CHECK-LABEL: utofp_v3i32_v3f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: fmov w0, s1 -; CHECK-NEXT: bl __floatunsitf -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: bl __floatunsitf -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: mov w0, v0.s[1] -; CHECK-NEXT: bl __floatunsitf -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: ldp q0, q2, [sp] // 32-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #64 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_v3i32_v3f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #64 +; CHECK-SD-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: fmov w0, s1 +; CHECK-SD-NEXT: bl __floatunsitf +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: bl __floatunsitf +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w0, v0.s[1] +; CHECK-SD-NEXT: bl __floatunsitf +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ldp q0, q2, [sp] // 32-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #64 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_v3i32_v3f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #64 +; CHECK-GI-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: mov s8, v0.s[1] +; CHECK-GI-NEXT: mov s9, v0.s[2] +; CHECK-GI-NEXT: bl __floatunsitf +; CHECK-GI-NEXT: fmov w0, s8 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatunsitf +; CHECK-GI-NEXT: fmov w0, s9 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatunsitf +; CHECK-GI-NEXT: mov v2.16b, v0.16b +; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #64 +; CHECK-GI-NEXT: ret entry: %c = uitofp <3 x i32> %a to <3 x fp128> ret <3 x fp128> %c @@ -2703,63 +3130,115 @@ entry: ret <2 x fp128> %c } -define <3 x fp128> @stofp_v3i8_v3f128(<3 x i8> %a) { -; CHECK-LABEL: stofp_v3i8_v3f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w30, -32 -; CHECK-NEXT: sxtb w0, w0 -; CHECK-NEXT: mov w19, w2 -; CHECK-NEXT: mov w20, w1 -; CHECK-NEXT: bl __floatsitf -; CHECK-NEXT: sxtb w0, w20 -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: bl __floatsitf -; CHECK-NEXT: sxtb w0, w19 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floatsitf -; CHECK-NEXT: mov v2.16b, v0.16b -; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload -; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #64 -; CHECK-NEXT: ret +define <3 x fp128> @stofp_v2i8_v3f128(<3 x i8> %a) { +; CHECK-SD-LABEL: stofp_v2i8_v3f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #64 +; CHECK-SD-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w30, -32 +; CHECK-SD-NEXT: sxtb w0, w0 +; CHECK-SD-NEXT: mov w19, w2 +; CHECK-SD-NEXT: mov w20, w1 +; CHECK-SD-NEXT: bl __floatsitf +; CHECK-SD-NEXT: sxtb w0, w20 +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floatsitf +; CHECK-SD-NEXT: sxtb w0, w19 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floatsitf +; CHECK-SD-NEXT: mov v2.16b, v0.16b +; CHECK-SD-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-SD-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #64 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_v2i8_v3f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #64 +; CHECK-GI-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: sxtb w0, w0 +; CHECK-GI-NEXT: mov w19, w1 +; CHECK-GI-NEXT: mov w20, w2 +; CHECK-GI-NEXT: bl __floatsitf +; CHECK-GI-NEXT: sxtb w0, w19 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatsitf +; CHECK-GI-NEXT: sxtb w0, w20 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatsitf +; CHECK-GI-NEXT: mov v2.16b, v0.16b +; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #64 +; CHECK-GI-NEXT: ret entry: %c = sitofp <3 x i8> %a to <3 x fp128> ret <3 x fp128> %c } -define <3 x fp128> @utofp_v3i8_v3f128(<3 x i8> %a) { -; CHECK-LABEL: utofp_v3i8_v3f128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w30, -32 -; CHECK-NEXT: and w0, w0, #0xff -; CHECK-NEXT: mov w19, w2 -; CHECK-NEXT: mov w20, w1 -; CHECK-NEXT: bl __floatunsitf -; CHECK-NEXT: and w0, w20, #0xff -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: bl __floatunsitf -; CHECK-NEXT: and w0, w19, #0xff -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floatunsitf -; CHECK-NEXT: mov v2.16b, v0.16b -; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload -; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #64 -; CHECK-NEXT: ret +define <3 x fp128> @utofp_v2i8_v3f128(<3 x i8> %a) { +; CHECK-SD-LABEL: utofp_v2i8_v3f128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #64 +; CHECK-SD-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w30, -32 +; CHECK-SD-NEXT: and w0, w0, #0xff +; CHECK-SD-NEXT: mov w19, w2 +; CHECK-SD-NEXT: mov w20, w1 +; CHECK-SD-NEXT: bl __floatunsitf +; CHECK-SD-NEXT: and w0, w20, #0xff +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floatunsitf +; CHECK-SD-NEXT: and w0, w19, #0xff +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floatunsitf +; CHECK-SD-NEXT: mov v2.16b, v0.16b +; CHECK-SD-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-SD-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #64 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_v2i8_v3f128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #64 +; CHECK-GI-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: and w0, w0, #0xff +; CHECK-GI-NEXT: mov w19, w1 +; CHECK-GI-NEXT: mov w20, w2 +; CHECK-GI-NEXT: bl __floatunsitf +; CHECK-GI-NEXT: and w0, w19, #0xff +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatunsitf +; CHECK-GI-NEXT: and w0, w20, #0xff +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatunsitf +; CHECK-GI-NEXT: mov v2.16b, v0.16b +; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #64 +; CHECK-GI-NEXT: ret entry: %c = uitofp <3 x i8> %a to <3 x fp128> ret <3 x fp128> %c @@ -3674,158 +4153,286 @@ entry: } define <2 x float> @stofp_v2i128_v2f32(<2 x i128> %a) { -; CHECK-LABEL: stofp_v2i128_v2f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w30, -32 -; CHECK-NEXT: mov x19, x1 -; CHECK-NEXT: mov x20, x0 -; CHECK-NEXT: mov x0, x2 -; CHECK-NEXT: mov x1, x3 -; CHECK-NEXT: bl __floattisf -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: mov x1, x19 -; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floattisf -; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: mov v0.s[1], v1.s[0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: add sp, sp, #48 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_v2i128_v2f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #48 +; CHECK-SD-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w30, -32 +; CHECK-SD-NEXT: mov x19, x1 +; CHECK-SD-NEXT: mov x20, x0 +; CHECK-SD-NEXT: mov x0, x2 +; CHECK-SD-NEXT: mov x1, x3 +; CHECK-SD-NEXT: bl __floattisf +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x19 +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floattisf +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: add sp, sp, #48 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_v2i128_v2f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #48 +; CHECK-GI-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: bl __floattisf +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floattisf +; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.s[1], v0.s[0] +; CHECK-GI-NEXT: fmov d0, d1 +; CHECK-GI-NEXT: add sp, sp, #48 +; CHECK-GI-NEXT: ret entry: %c = sitofp <2 x i128> %a to <2 x float> ret <2 x float> %c } define <2 x float> @utofp_v2i128_v2f32(<2 x i128> %a) { -; CHECK-LABEL: utofp_v2i128_v2f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w30, -32 -; CHECK-NEXT: mov x19, x1 -; CHECK-NEXT: mov x20, x0 -; CHECK-NEXT: mov x0, x2 -; CHECK-NEXT: mov x1, x3 -; CHECK-NEXT: bl __floatuntisf -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: mov x1, x19 -; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floatuntisf -; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: mov v0.s[1], v1.s[0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: add sp, sp, #48 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_v2i128_v2f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #48 +; CHECK-SD-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w30, -32 +; CHECK-SD-NEXT: mov x19, x1 +; CHECK-SD-NEXT: mov x20, x0 +; CHECK-SD-NEXT: mov x0, x2 +; CHECK-SD-NEXT: mov x1, x3 +; CHECK-SD-NEXT: bl __floatuntisf +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x19 +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floatuntisf +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: add sp, sp, #48 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_v2i128_v2f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #48 +; CHECK-GI-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: bl __floatuntisf +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatuntisf +; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v1.s[1], v0.s[0] +; CHECK-GI-NEXT: fmov d0, d1 +; CHECK-GI-NEXT: add sp, sp, #48 +; CHECK-GI-NEXT: ret entry: %c = uitofp <2 x i128> %a to <2 x float> ret <2 x float> %c } define <3 x float> @stofp_v3i128_v3f32(<3 x i128> %a) { -; CHECK-LABEL: stofp_v3i128_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w30, -48 -; CHECK-NEXT: mov x21, x1 -; CHECK-NEXT: mov x22, x0 -; CHECK-NEXT: mov x0, x2 -; CHECK-NEXT: mov x1, x3 -; CHECK-NEXT: mov x19, x5 -; CHECK-NEXT: mov x20, x4 -; CHECK-NEXT: bl __floattisf -; CHECK-NEXT: mov x0, x22 -; CHECK-NEXT: mov x1, x21 -; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floattisf -; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: mov x1, x19 -; CHECK-NEXT: mov v0.s[1], v1.s[0] -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floattisf -; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: mov v1.s[2], v0.s[0] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: add sp, sp, #64 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_v3i128_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #64 +; CHECK-SD-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w30, -48 +; CHECK-SD-NEXT: mov x21, x1 +; CHECK-SD-NEXT: mov x22, x0 +; CHECK-SD-NEXT: mov x0, x2 +; CHECK-SD-NEXT: mov x1, x3 +; CHECK-SD-NEXT: mov x19, x5 +; CHECK-SD-NEXT: mov x20, x4 +; CHECK-SD-NEXT: bl __floattisf +; CHECK-SD-NEXT: mov x0, x22 +; CHECK-SD-NEXT: mov x1, x21 +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floattisf +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x19 +; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floattisf +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-SD-NEXT: mov v1.s[2], v0.s[0] +; CHECK-SD-NEXT: mov v0.16b, v1.16b +; CHECK-SD-NEXT: add sp, sp, #64 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_v3i128_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #80 +; CHECK-GI-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 80 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w30, -48 +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: mov x21, x4 +; CHECK-GI-NEXT: mov x22, x5 +; CHECK-GI-NEXT: bl __floattisf +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floattisf +; CHECK-GI-NEXT: mov x0, x21 +; CHECK-GI-NEXT: mov x1, x22 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floattisf +; CHECK-GI-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: add sp, sp, #80 +; CHECK-GI-NEXT: ret entry: %c = sitofp <3 x i128> %a to <3 x float> ret <3 x float> %c } define <3 x float> @utofp_v3i128_v3f32(<3 x i128> %a) { -; CHECK-LABEL: utofp_v3i128_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w30, -48 -; CHECK-NEXT: mov x21, x1 -; CHECK-NEXT: mov x22, x0 -; CHECK-NEXT: mov x0, x2 -; CHECK-NEXT: mov x1, x3 -; CHECK-NEXT: mov x19, x5 -; CHECK-NEXT: mov x20, x4 -; CHECK-NEXT: bl __floatuntisf -; CHECK-NEXT: mov x0, x22 -; CHECK-NEXT: mov x1, x21 -; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floatuntisf -; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: mov x1, x19 -; CHECK-NEXT: mov v0.s[1], v1.s[0] -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __floatuntisf -; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: mov v1.s[2], v0.s[0] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: add sp, sp, #64 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_v3i128_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #64 +; CHECK-SD-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w30, -48 +; CHECK-SD-NEXT: mov x21, x1 +; CHECK-SD-NEXT: mov x22, x0 +; CHECK-SD-NEXT: mov x0, x2 +; CHECK-SD-NEXT: mov x1, x3 +; CHECK-SD-NEXT: mov x19, x5 +; CHECK-SD-NEXT: mov x20, x4 +; CHECK-SD-NEXT: bl __floatuntisf +; CHECK-SD-NEXT: mov x0, x22 +; CHECK-SD-NEXT: mov x1, x21 +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floatuntisf +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x19 +; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __floatuntisf +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-SD-NEXT: mov v1.s[2], v0.s[0] +; CHECK-SD-NEXT: mov v0.16b, v1.16b +; CHECK-SD-NEXT: add sp, sp, #64 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_v3i128_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #80 +; CHECK-GI-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 80 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w30, -48 +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: mov x21, x4 +; CHECK-GI-NEXT: mov x22, x5 +; CHECK-GI-NEXT: bl __floatuntisf +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatuntisf +; CHECK-GI-NEXT: mov x0, x21 +; CHECK-GI-NEXT: mov x1, x22 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __floatuntisf +; CHECK-GI-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] +; CHECK-GI-NEXT: mov v1.s[2], v0.s[0] +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: add sp, sp, #80 +; CHECK-GI-NEXT: ret entry: %c = uitofp <3 x i128> %a to <3 x float> ret <3 x float> %c @@ -5103,18 +5710,16 @@ define <2 x half> @stofp_v2i128_v2f16(<2 x i128> %a) { ; CHECK-GI-NOFP16-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w20, -16 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w30, -32 -; CHECK-GI-NOFP16-NEXT: mov x19, x1 -; CHECK-GI-NOFP16-NEXT: mov x20, x0 -; CHECK-GI-NOFP16-NEXT: mov x0, x2 -; CHECK-GI-NOFP16-NEXT: mov x1, x3 +; CHECK-GI-NOFP16-NEXT: mov x19, x2 +; CHECK-GI-NOFP16-NEXT: mov x20, x3 ; CHECK-GI-NOFP16-NEXT: bl __floattisf ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 -; CHECK-GI-NOFP16-NEXT: mov x0, x20 -; CHECK-GI-NOFP16-NEXT: mov x1, x19 +; CHECK-GI-NOFP16-NEXT: mov x0, x19 +; CHECK-GI-NOFP16-NEXT: mov x1, x20 ; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NOFP16-NEXT: bl __floattisf -; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 -; CHECK-GI-NOFP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: fcvt h1, s0 +; CHECK-GI-NOFP16-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload ; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] @@ -5131,13 +5736,11 @@ define <2 x half> @stofp_v2i128_v2f16(<2 x i128> %a) { ; CHECK-GI-FP16-NEXT: .cfi_offset w19, -8 ; CHECK-GI-FP16-NEXT: .cfi_offset w20, -16 ; CHECK-GI-FP16-NEXT: .cfi_offset w30, -32 -; CHECK-GI-FP16-NEXT: mov x19, x1 -; CHECK-GI-FP16-NEXT: mov x20, x0 -; CHECK-GI-FP16-NEXT: mov x0, x2 -; CHECK-GI-FP16-NEXT: mov x1, x3 +; CHECK-GI-FP16-NEXT: mov x19, x2 +; CHECK-GI-FP16-NEXT: mov x20, x3 ; CHECK-GI-FP16-NEXT: bl __floattihf -; CHECK-GI-FP16-NEXT: mov x0, x20 -; CHECK-GI-FP16-NEXT: mov x1, x19 +; CHECK-GI-FP16-NEXT: mov x0, x19 +; CHECK-GI-FP16-NEXT: mov x1, x20 ; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 ; CHECK-GI-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-FP16-NEXT: bl __floattihf @@ -5145,8 +5748,8 @@ define <2 x half> @stofp_v2i128_v2f16(<2 x i128> %a) { ; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 ; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[0] +; CHECK-GI-FP16-NEXT: fmov d0, d1 ; CHECK-GI-FP16-NEXT: add sp, sp, #48 ; CHECK-GI-FP16-NEXT: ret entry: @@ -5220,18 +5823,16 @@ define <2 x half> @utofp_v2i128_v2f16(<2 x i128> %a) { ; CHECK-GI-NOFP16-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w20, -16 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w30, -32 -; CHECK-GI-NOFP16-NEXT: mov x19, x1 -; CHECK-GI-NOFP16-NEXT: mov x20, x0 -; CHECK-GI-NOFP16-NEXT: mov x0, x2 -; CHECK-GI-NOFP16-NEXT: mov x1, x3 +; CHECK-GI-NOFP16-NEXT: mov x19, x2 +; CHECK-GI-NOFP16-NEXT: mov x20, x3 ; CHECK-GI-NOFP16-NEXT: bl __floatuntisf ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 -; CHECK-GI-NOFP16-NEXT: mov x0, x20 -; CHECK-GI-NOFP16-NEXT: mov x1, x19 +; CHECK-GI-NOFP16-NEXT: mov x0, x19 +; CHECK-GI-NOFP16-NEXT: mov x1, x20 ; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NOFP16-NEXT: bl __floatuntisf -; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 -; CHECK-GI-NOFP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: fcvt h1, s0 +; CHECK-GI-NOFP16-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload ; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] @@ -5248,13 +5849,11 @@ define <2 x half> @utofp_v2i128_v2f16(<2 x i128> %a) { ; CHECK-GI-FP16-NEXT: .cfi_offset w19, -8 ; CHECK-GI-FP16-NEXT: .cfi_offset w20, -16 ; CHECK-GI-FP16-NEXT: .cfi_offset w30, -32 -; CHECK-GI-FP16-NEXT: mov x19, x1 -; CHECK-GI-FP16-NEXT: mov x20, x0 -; CHECK-GI-FP16-NEXT: mov x0, x2 -; CHECK-GI-FP16-NEXT: mov x1, x3 +; CHECK-GI-FP16-NEXT: mov x19, x2 +; CHECK-GI-FP16-NEXT: mov x20, x3 ; CHECK-GI-FP16-NEXT: bl __floatuntihf -; CHECK-GI-FP16-NEXT: mov x0, x20 -; CHECK-GI-FP16-NEXT: mov x1, x19 +; CHECK-GI-FP16-NEXT: mov x0, x19 +; CHECK-GI-FP16-NEXT: mov x1, x20 ; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 ; CHECK-GI-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-FP16-NEXT: bl __floatuntihf @@ -5262,8 +5861,8 @@ define <2 x half> @utofp_v2i128_v2f16(<2 x i128> %a) { ; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 ; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] -; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-FP16-NEXT: mov v1.h[1], v0.h[0] +; CHECK-GI-FP16-NEXT: fmov d0, d1 ; CHECK-GI-FP16-NEXT: add sp, sp, #48 ; CHECK-GI-FP16-NEXT: ret entry: @@ -5356,84 +5955,80 @@ define <3 x half> @stofp_v3i128_v3f16(<3 x i128> %a) { ; ; CHECK-GI-NOFP16-LABEL: stofp_v3i128_v3f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry -; CHECK-GI-NOFP16-NEXT: sub sp, sp, #64 -; CHECK-GI-NOFP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-GI-NOFP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NOFP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NOFP16-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NOFP16-NEXT: sub sp, sp, #80 +; CHECK-GI-NOFP16-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: .cfi_def_cfa_offset 80 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w20, -16 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w21, -24 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w22, -32 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w30, -48 -; CHECK-GI-NOFP16-NEXT: mov x21, x1 -; CHECK-GI-NOFP16-NEXT: mov x22, x0 -; CHECK-GI-NOFP16-NEXT: mov x0, x2 -; CHECK-GI-NOFP16-NEXT: mov x1, x3 -; CHECK-GI-NOFP16-NEXT: mov x19, x5 -; CHECK-GI-NOFP16-NEXT: mov x20, x4 +; CHECK-GI-NOFP16-NEXT: mov x19, x2 +; CHECK-GI-NOFP16-NEXT: mov x20, x3 +; CHECK-GI-NOFP16-NEXT: mov x21, x4 +; CHECK-GI-NOFP16-NEXT: mov x22, x5 ; CHECK-GI-NOFP16-NEXT: bl __floattisf ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 -; CHECK-GI-NOFP16-NEXT: mov x0, x22 -; CHECK-GI-NOFP16-NEXT: mov x1, x21 -; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: mov x0, x19 +; CHECK-GI-NOFP16-NEXT: mov x1, x20 +; CHECK-GI-NOFP16-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NOFP16-NEXT: bl __floattisf ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 -; CHECK-GI-NOFP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-GI-NOFP16-NEXT: mov x0, x20 -; CHECK-GI-NOFP16-NEXT: mov x1, x19 -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov x0, x21 +; CHECK-GI-NOFP16-NEXT: mov x1, x22 ; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NOFP16-NEXT: bl __floattisf -; CHECK-GI-NOFP16-NEXT: fcvt h1, s0 -; CHECK-GI-NOFP16-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NOFP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NOFP16-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 +; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NOFP16-NEXT: add sp, sp, #64 +; CHECK-GI-NOFP16-NEXT: add sp, sp, #80 ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: stofp_v3i128_v3f16: ; CHECK-GI-FP16: // %bb.0: // %entry -; CHECK-GI-FP16-NEXT: sub sp, sp, #64 -; CHECK-GI-FP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-GI-FP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-FP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-FP16-NEXT: sub sp, sp, #80 +; CHECK-GI-FP16-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-GI-FP16-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-FP16-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 80 ; CHECK-GI-FP16-NEXT: .cfi_offset w19, -8 ; CHECK-GI-FP16-NEXT: .cfi_offset w20, -16 ; CHECK-GI-FP16-NEXT: .cfi_offset w21, -24 ; CHECK-GI-FP16-NEXT: .cfi_offset w22, -32 ; CHECK-GI-FP16-NEXT: .cfi_offset w30, -48 -; CHECK-GI-FP16-NEXT: mov x21, x1 -; CHECK-GI-FP16-NEXT: mov x22, x0 -; CHECK-GI-FP16-NEXT: mov x0, x2 -; CHECK-GI-FP16-NEXT: mov x1, x3 -; CHECK-GI-FP16-NEXT: mov x19, x5 -; CHECK-GI-FP16-NEXT: mov x20, x4 +; CHECK-GI-FP16-NEXT: mov x19, x2 +; CHECK-GI-FP16-NEXT: mov x20, x3 +; CHECK-GI-FP16-NEXT: mov x21, x4 +; CHECK-GI-FP16-NEXT: mov x22, x5 ; CHECK-GI-FP16-NEXT: bl __floattihf -; CHECK-GI-FP16-NEXT: mov x0, x22 -; CHECK-GI-FP16-NEXT: mov x1, x21 +; CHECK-GI-FP16-NEXT: mov x0, x19 +; CHECK-GI-FP16-NEXT: mov x1, x20 ; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 -; CHECK-GI-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-FP16-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-FP16-NEXT: bl __floattihf -; CHECK-GI-FP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-FP16-NEXT: mov x0, x21 +; CHECK-GI-FP16-NEXT: mov x1, x22 ; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 -; CHECK-GI-FP16-NEXT: mov x0, x20 -; CHECK-GI-FP16-NEXT: mov x1, x19 -; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-FP16-NEXT: bl __floattihf -; CHECK-GI-FP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-FP16-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload ; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 -; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-FP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-FP16-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-FP16-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-FP16-NEXT: mov v1.h[1], v2.h[0] ; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[0] -; CHECK-GI-FP16-NEXT: fmov d0, d1 -; CHECK-GI-FP16-NEXT: add sp, sp, #64 +; CHECK-GI-FP16-NEXT: mov v0.16b, v1.16b +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-FP16-NEXT: add sp, sp, #80 ; CHECK-GI-FP16-NEXT: ret entry: %c = sitofp <3 x i128> %a to <3 x half> @@ -5525,84 +6120,80 @@ define <3 x half> @utofp_v3i128_v3f16(<3 x i128> %a) { ; ; CHECK-GI-NOFP16-LABEL: utofp_v3i128_v3f16: ; CHECK-GI-NOFP16: // %bb.0: // %entry -; CHECK-GI-NOFP16-NEXT: sub sp, sp, #64 -; CHECK-GI-NOFP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-GI-NOFP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NOFP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NOFP16-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NOFP16-NEXT: sub sp, sp, #80 +; CHECK-GI-NOFP16-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: .cfi_def_cfa_offset 80 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w20, -16 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w21, -24 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w22, -32 ; CHECK-GI-NOFP16-NEXT: .cfi_offset w30, -48 -; CHECK-GI-NOFP16-NEXT: mov x21, x1 -; CHECK-GI-NOFP16-NEXT: mov x22, x0 -; CHECK-GI-NOFP16-NEXT: mov x0, x2 -; CHECK-GI-NOFP16-NEXT: mov x1, x3 -; CHECK-GI-NOFP16-NEXT: mov x19, x5 -; CHECK-GI-NOFP16-NEXT: mov x20, x4 +; CHECK-GI-NOFP16-NEXT: mov x19, x2 +; CHECK-GI-NOFP16-NEXT: mov x20, x3 +; CHECK-GI-NOFP16-NEXT: mov x21, x4 +; CHECK-GI-NOFP16-NEXT: mov x22, x5 ; CHECK-GI-NOFP16-NEXT: bl __floatuntisf ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 -; CHECK-GI-NOFP16-NEXT: mov x0, x22 -; CHECK-GI-NOFP16-NEXT: mov x1, x21 -; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NOFP16-NEXT: mov x0, x19 +; CHECK-GI-NOFP16-NEXT: mov x1, x20 +; CHECK-GI-NOFP16-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NOFP16-NEXT: bl __floatuntisf ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 -; CHECK-GI-NOFP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-GI-NOFP16-NEXT: mov x0, x20 -; CHECK-GI-NOFP16-NEXT: mov x1, x19 -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov x0, x21 +; CHECK-GI-NOFP16-NEXT: mov x1, x22 ; CHECK-GI-NOFP16-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NOFP16-NEXT: bl __floatuntisf -; CHECK-GI-NOFP16-NEXT: fcvt h1, s0 -; CHECK-GI-NOFP16-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-NOFP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NOFP16-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 +; CHECK-GI-NOFP16-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NOFP16-NEXT: add sp, sp, #64 +; CHECK-GI-NOFP16-NEXT: add sp, sp, #80 ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: utofp_v3i128_v3f16: ; CHECK-GI-FP16: // %bb.0: // %entry -; CHECK-GI-FP16-NEXT: sub sp, sp, #64 -; CHECK-GI-FP16-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-GI-FP16-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-FP16-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-FP16-NEXT: sub sp, sp, #80 +; CHECK-GI-FP16-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-GI-FP16-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-FP16-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-FP16-NEXT: .cfi_def_cfa_offset 80 ; CHECK-GI-FP16-NEXT: .cfi_offset w19, -8 ; CHECK-GI-FP16-NEXT: .cfi_offset w20, -16 ; CHECK-GI-FP16-NEXT: .cfi_offset w21, -24 ; CHECK-GI-FP16-NEXT: .cfi_offset w22, -32 ; CHECK-GI-FP16-NEXT: .cfi_offset w30, -48 -; CHECK-GI-FP16-NEXT: mov x21, x1 -; CHECK-GI-FP16-NEXT: mov x22, x0 -; CHECK-GI-FP16-NEXT: mov x0, x2 -; CHECK-GI-FP16-NEXT: mov x1, x3 -; CHECK-GI-FP16-NEXT: mov x19, x5 -; CHECK-GI-FP16-NEXT: mov x20, x4 +; CHECK-GI-FP16-NEXT: mov x19, x2 +; CHECK-GI-FP16-NEXT: mov x20, x3 +; CHECK-GI-FP16-NEXT: mov x21, x4 +; CHECK-GI-FP16-NEXT: mov x22, x5 ; CHECK-GI-FP16-NEXT: bl __floatuntihf -; CHECK-GI-FP16-NEXT: mov x0, x22 -; CHECK-GI-FP16-NEXT: mov x1, x21 +; CHECK-GI-FP16-NEXT: mov x0, x19 +; CHECK-GI-FP16-NEXT: mov x1, x20 ; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 -; CHECK-GI-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-FP16-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-FP16-NEXT: bl __floatuntihf -; CHECK-GI-FP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-FP16-NEXT: mov x0, x21 +; CHECK-GI-FP16-NEXT: mov x1, x22 ; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 -; CHECK-GI-FP16-NEXT: mov x0, x20 -; CHECK-GI-FP16-NEXT: mov x1, x19 -; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-FP16-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-FP16-NEXT: bl __floatuntihf -; CHECK-GI-FP16-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-FP16-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload ; CHECK-GI-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 -; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-GI-FP16-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-GI-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-FP16-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-FP16-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-FP16-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-FP16-NEXT: mov v1.h[1], v2.h[0] ; CHECK-GI-FP16-NEXT: mov v1.h[2], v0.h[0] -; CHECK-GI-FP16-NEXT: fmov d0, d1 -; CHECK-GI-FP16-NEXT: add sp, sp, #64 +; CHECK-GI-FP16-NEXT: mov v0.16b, v1.16b +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-FP16-NEXT: add sp, sp, #80 ; CHECK-GI-FP16-NEXT: ret entry: %c = uitofp <3 x i128> %a to <3 x half> diff --git a/llvm/test/CodeGen/AArch64/pr58431.ll b/llvm/test/CodeGen/AArch64/pr58431.ll index e87d8f7874d62e..88bab4af95d64f 100644 --- a/llvm/test/CodeGen/AArch64/pr58431.ll +++ b/llvm/test/CodeGen/AArch64/pr58431.ll @@ -5,7 +5,7 @@ define i32 @f(i64 %0) { ; CHECK-LABEL: f: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #10 // =0xa -; CHECK-NEXT: and x9, x0, #0xffffffff +; CHECK-NEXT: mov w9, w0 ; CHECK-NEXT: udiv x10, x9, x8 ; CHECK-NEXT: msub x0, x10, x8, x9 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 diff --git a/llvm/test/CodeGen/AArch64/ptrauth-call.ll b/llvm/test/CodeGen/AArch64/ptrauth-call.ll index 72e158fdf99168..9f211b6e1796e6 100644 --- a/llvm/test/CodeGen/AArch64/ptrauth-call.ll +++ b/llvm/test/CodeGen/AArch64/ptrauth-call.ll @@ -269,4 +269,136 @@ define i32 @test_tailcall_ib_arg_ind(ptr %arg0, i64 %arg1) #0 { ret i32 %tmp1 } +; Test direct calls + +define i32 @test_direct_call() #0 { +; DARWIN-LABEL: test_direct_call: +; DARWIN-NEXT: stp x29, x30, [sp, #-16]! +; DARWIN-NEXT: bl _f +; DARWIN-NEXT: ldp x29, x30, [sp], #16 +; DARWIN-NEXT: ret +; +; ELF-LABEL: test_direct_call: +; ELF-NEXT: str x30, [sp, #-16]! +; ELF-NEXT: bl f +; ELF-NEXT: ldr x30, [sp], #16 +; ELF-NEXT: ret + %tmp0 = call i32 ptrauth(ptr @f, i32 0, i64 42)() [ "ptrauth"(i32 0, i64 42) ] + ret i32 %tmp0 +} + +define i32 @test_direct_tailcall(ptr %arg0) #0 { +; DARWIN-LABEL: test_direct_tailcall: +; DARWIN: b _f +; +; ELF-LABEL: test_direct_tailcall: +; ELF-NEXT: b f + %tmp0 = tail call i32 ptrauth(ptr @f, i32 0, i64 42)() [ "ptrauth"(i32 0, i64 42) ] + ret i32 %tmp0 +} + +define i32 @test_direct_call_mismatch() #0 { +; DARWIN-LABEL: test_direct_call_mismatch: +; DARWIN-NEXT: stp x29, x30, [sp, #-16]! +; DARWIN-NEXT: adrp x16, _f@GOTPAGE +; DARWIN-NEXT: ldr x16, [x16, _f@GOTPAGEOFF] +; DARWIN-NEXT: mov x17, #42 +; DARWIN-NEXT: pacia x16, x17 +; DARWIN-NEXT: mov x8, x16 +; DARWIN-NEXT: mov x17, #42 +; DARWIN-NEXT: blrab x8, x17 +; DARWIN-NEXT: ldp x29, x30, [sp], #16 +; DARWIN-NEXT: ret +; +; ELF-LABEL: test_direct_call_mismatch: +; ELF-NEXT: str x30, [sp, #-16]! +; ELF-NEXT: adrp x16, :got:f +; ELF-NEXT: ldr x16, [x16, :got_lo12:f] +; ELF-NEXT: mov x17, #42 +; ELF-NEXT: pacia x16, x17 +; ELF-NEXT: mov x8, x16 +; ELF-NEXT: mov x17, #42 +; ELF-NEXT: blrab x8, x17 +; ELF-NEXT: ldr x30, [sp], #16 +; ELF-NEXT: ret + %tmp0 = call i32 ptrauth(ptr @f, i32 0, i64 42)() [ "ptrauth"(i32 1, i64 42) ] + ret i32 %tmp0 +} + +define i32 @test_direct_call_addr() #0 { +; DARWIN-LABEL: test_direct_call_addr: +; DARWIN-NEXT: stp x29, x30, [sp, #-16]! +; DARWIN-NEXT: bl _f +; DARWIN-NEXT: ldp x29, x30, [sp], #16 +; DARWIN-NEXT: ret +; +; ELF-LABEL: test_direct_call_addr: +; ELF-NEXT: str x30, [sp, #-16]! +; ELF-NEXT: bl f +; ELF-NEXT: ldr x30, [sp], #16 +; ELF-NEXT: ret + %tmp0 = call i32 ptrauth(ptr @f, i32 1, i64 0, ptr @f.ref.ib.0.addr)() [ "ptrauth"(i32 1, i64 ptrtoint (ptr @f.ref.ib.0.addr to i64)) ] + ret i32 %tmp0 +} + +define i32 @test_direct_call_addr_blend() #0 { +; DARWIN-LABEL: test_direct_call_addr_blend: +; DARWIN-NEXT: stp x29, x30, [sp, #-16]! +; DARWIN-NEXT: bl _f +; DARWIN-NEXT: ldp x29, x30, [sp], #16 +; DARWIN-NEXT: ret +; +; ELF-LABEL: test_direct_call_addr_blend: +; ELF-NEXT: str x30, [sp, #-16]! +; ELF-NEXT: bl f +; ELF-NEXT: ldr x30, [sp], #16 +; ELF-NEXT: ret + %tmp0 = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @f.ref.ib.42.addr to i64), i64 42) + %tmp1 = call i32 ptrauth(ptr @f, i32 1, i64 42, ptr @f.ref.ib.42.addr)() [ "ptrauth"(i32 1, i64 %tmp0) ] + ret i32 %tmp1 +} + +define i32 @test_direct_call_addr_gep_different_index_types() #0 { +; DARWIN-LABEL: test_direct_call_addr_gep_different_index_types: +; DARWIN-NEXT: stp x29, x30, [sp, #-16]! +; DARWIN-NEXT: bl _f +; DARWIN-NEXT: ldp x29, x30, [sp], #16 +; DARWIN-NEXT: ret +; +; ELF-LABEL: test_direct_call_addr_gep_different_index_types: +; ELF-NEXT: str x30, [sp, #-16]! +; ELF-NEXT: bl f +; ELF-NEXT: ldr x30, [sp], #16 +; ELF-NEXT: ret + %tmp0 = call i32 ptrauth(ptr @f, i32 1, i64 0, ptr getelementptr ({ ptr }, ptr @f_struct.ref.ib.0.addr, i64 0, i32 0))() [ "ptrauth"(i32 1, i64 ptrtoint (ptr getelementptr ({ ptr }, ptr @f_struct.ref.ib.0.addr, i32 0, i32 0) to i64)) ] + ret i32 %tmp0 +} + +define i32 @test_direct_call_addr_blend_gep_different_index_types() #0 { +; DARWIN-LABEL: test_direct_call_addr_blend_gep_different_index_types: +; DARWIN-NEXT: stp x29, x30, [sp, #-16]! +; DARWIN-NEXT: bl _f +; DARWIN-NEXT: ldp x29, x30, [sp], #16 +; DARWIN-NEXT: ret +; +; ELF-LABEL: test_direct_call_addr_blend_gep_different_index_types: +; ELF-NEXT: str x30, [sp, #-16]! +; ELF-NEXT: bl f +; ELF-NEXT: ldr x30, [sp], #16 +; ELF-NEXT: ret + %tmp0 = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr getelementptr ({ ptr }, ptr @f_struct.ref.ib.123.addr, i32 0, i32 0) to i64), i64 123) + %tmp1 = call i32 ptrauth(ptr @f, i32 1, i64 123, ptr getelementptr ({ ptr }, ptr @f_struct.ref.ib.123.addr, i64 0, i32 0))() [ "ptrauth"(i32 1, i64 %tmp0) ] + ret i32 %tmp1 +} + +@f.ref.ib.42.addr = external global ptr +@f.ref.ib.0.addr = external global ptr +@f_struct.ref.ib.0.addr = external global ptr +@f_struct.ref.ib.123.addr = external global ptr + +declare void @f() + +declare i64 @llvm.ptrauth.auth(i64, i32, i64) +declare i64 @llvm.ptrauth.blend(i64, i64) + attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/ptrauth-invoke.ll b/llvm/test/CodeGen/AArch64/ptrauth-invoke.ll index fcd0ddb7883362..f6b3a88ca46779 100644 --- a/llvm/test/CodeGen/AArch64/ptrauth-invoke.ll +++ b/llvm/test/CodeGen/AArch64/ptrauth-invoke.ll @@ -230,9 +230,6 @@ continuebb: ; CHECK-NEXT: [[TT]]: -; ELF-LABEL: .L_ZTIPKc.DW.stub: -; ELF-NEXT: .xword _ZTIPKc - define void @test_invoke_ib_42_catch(ptr %fptr) #0 personality ptr @__gxx_personality_v0 { %tmp0 = call ptr @__cxa_allocate_exception(i64 8) store ptr getelementptr inbounds ([6 x i8], ptr @hello_str, i64 0, i64 0), ptr %tmp0, align 8 @@ -263,8 +260,208 @@ continuebb: unreachable } +; DARWIN-LABEL: _test_invoke_ia_0_direct: +; DARWIN-NEXT: [[FNBEGIN:L.*]]: +; DARWIN-NEXT: .cfi_startproc +; DARWIN-NEXT: .cfi_personality 155, ___gxx_personality_v0 +; DARWIN-NEXT: .cfi_lsda 16, [[EXCEPT:Lexception[0-9]+]] +; DARWIN-NEXT: ; %bb.0: +; DARWIN-NEXT: stp x20, x19, [sp, #-32]! +; DARWIN-NEXT: stp x29, x30, [sp, #16] +; DARWIN-NEXT: .cfi_def_cfa_offset 32 +; DARWIN-NEXT: .cfi_offset w30, -8 +; DARWIN-NEXT: .cfi_offset w29, -16 +; DARWIN-NEXT: .cfi_offset w19, -24 +; DARWIN-NEXT: .cfi_offset w20, -32 +; DARWIN-NEXT: [[PRECALL:L.*]]: +; DARWIN-NEXT: bl _baz + +; DARWIN-SDAG-NEXT: [[POSTCALL:L.*]]: +; DARWIN-SDAG-NEXT: ; %bb.1: +; DARWIN-SDAG-NEXT: mov x19, x0 + +; DARWIN-GISEL-NEXT: mov x19, x0 +; DARWIN-GISEL-NEXT: [[POSTCALL:L.*]]: + +; DARWIN-NEXT: [[CALLBB:L.*]]: +; DARWIN-NEXT: bl _foo +; DARWIN-NEXT: mov x0, x19 +; DARWIN-NEXT: ldp x29, x30, [sp, #16] +; DARWIN-NEXT: ldp x20, x19, [sp], #32 +; DARWIN-NEXT: ret +; DARWIN-NEXT: [[LPADBB:LBB[0-9_]+]]: +; DARWIN-NEXT: [[LPAD:L.*]]: +; DARWIN-NEXT: mov w19, #-1 +; DARWIN-NEXT: b [[CALLBB]] + +; ELF-LABEL: test_invoke_ia_0_direct: +; ELF-NEXT: [[FNBEGIN:.L.*]]: +; ELF-NEXT: .cfi_startproc +; ELF-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; ELF-NEXT: .cfi_lsda 28, [[EXCEPT:.Lexception[0-9]+]] +; ELF-NEXT: // %bb.0: +; ELF-NEXT: stp x30, x19, [sp, #-16]! +; ELF-NEXT: .cfi_def_cfa_offset 16 +; ELF-NEXT: .cfi_offset w19, -8 +; ELF-NEXT: .cfi_offset w30, -16 +; ELF-NEXT: [[PRECALL:.L.*]]: +; ELF-NEXT: bl baz + +; ELF-SDAG-NEXT: [[POSTCALL:.L.*]]: +; ELF-SDAG-NEXT: // %bb.1: +; ELF-SDAG-NEXT: mov w19, w0 + +; ELF-GISEL-NEXT: mov w19, w0 +; ELF-GISEL-NEXT: [[POSTCALL:.L.*]]: + +; ELF-NEXT: [[CALLBB:.L.*]]: +; ELF-NEXT: bl foo +; ELF-NEXT: mov w0, w19 +; ELF-NEXT: ldp x30, x19, [sp], #16 +; ELF-NEXT: ret +; ELF-NEXT: [[LPADBB:.LBB[0-9_]+]]: +; ELF-NEXT: [[LPAD:.L.*]]: +; ELF-NEXT: mov w19, #-1 +; ELF-NEXT: b [[CALLBB]] + +; CHECK-LABEL: GCC_except_table{{.*}}: +; CHECK-NEXT: [[EXCEPT]]: +; CHECK: .uleb128 [[POSTCALL]]-[[PRECALL]] {{.*}} Call between [[PRECALL]] and [[POSTCALL]] +; CHECK-NEXT: .uleb128 [[LPAD]]-[[FNBEGIN]] {{.*}} jumps to [[LPAD]] +; CHECK-NEXT: .byte 0 {{.*}} On action: cleanup + +define i32 @test_invoke_ia_0_direct() #0 personality ptr @__gxx_personality_v0 { + %tmp0 = invoke i32 ptrauth (ptr @baz, i32 0)() [ "ptrauth"(i32 0, i64 0) ] to label %continuebb + unwind label %unwindbb + +unwindbb: + %tmp1 = landingpad { ptr, i32 } cleanup + call void @foo() + ret i32 -1 + +continuebb: + call void @foo() + ret i32 %tmp0 +} + +; DARWIN-LABEL: _test_invoke_ib_2_direct_mismatch: +; DARWIN-NEXT: [[FNBEGIN:L.*]]: +; DARWIN-NEXT: .cfi_startproc +; DARWIN-NEXT: .cfi_personality 155, ___gxx_personality_v0 +; DARWIN-NEXT: .cfi_lsda 16, [[EXCEPT:Lexception[0-9]+]] +; DARWIN-NEXT: ; %bb.0: +; DARWIN-NEXT: stp x20, x19, [sp, #-32]! +; DARWIN-NEXT: stp x29, x30, [sp, #16] +; DARWIN-NEXT: .cfi_def_cfa_offset 32 +; DARWIN-NEXT: .cfi_offset w30, -8 +; DARWIN-NEXT: .cfi_offset w29, -16 +; DARWIN-NEXT: .cfi_offset w19, -24 +; DARWIN-NEXT: .cfi_offset w20, -32 + +; DARWIN-SDAG-NEXT: [[PRECALL:L.*]]: +; DARWIN-SDAG-NEXT: adrp x16, _baz@GOTPAGE +; DARWIN-SDAG-NEXT: ldr x16, [x16, _baz@GOTPAGEOFF] +; DARWIN-SDAG-NEXT: mov x17, #1234 +; DARWIN-SDAG-NEXT: pacia x16, x17 +; DARWIN-SDAG-NEXT: mov x8, x16 +; DARWIN-SDAG-NEXT: mov x17, #2 +; DARWIN-SDAG-NEXT: blrab x8, x17 +; DARWIN-SDAG-NEXT: [[POSTCALL:L.*]]: +; DARWIN-SDAG-NEXT: ; %bb.1: +; DARWIN-SDAG-NEXT: mov x19, x0 + +; DARWIN-GISEL-NEXT: adrp x16, _baz@GOTPAGE +; DARWIN-GISEL-NEXT: ldr x16, [x16, _baz@GOTPAGEOFF] +; DARWIN-GISEL-NEXT: mov x17, #1234 +; DARWIN-GISEL-NEXT: pacia x16, x17 +; DARWIN-GISEL-NEXT: mov x8, x16 +; DARWIN-GISEL-NEXT: [[PRECALL:L.*]]: +; DARWIN-GISEL-NEXT: mov x17, #2 +; DARWIN-GISEL-NEXT: blrab x8, x17 +; DARWIN-GISEL-NEXT: mov x19, x0 +; DARWIN-GISEL-NEXT: [[POSTCALL:L.*]]: + +; DARWIN-NEXT: [[CALLBB:L.*]]: +; DARWIN-NEXT: bl _foo +; DARWIN-NEXT: mov x0, x19 +; DARWIN-NEXT: ldp x29, x30, [sp, #16] +; DARWIN-NEXT: ldp x20, x19, [sp], #32 +; DARWIN-NEXT: ret +; DARWIN-NEXT: [[LPADBB:LBB[0-9_]+]]: +; DARWIN-NEXT: [[LPAD:L.*]]: +; DARWIN-NEXT: mov w19, #-1 +; DARWIN-NEXT: b [[CALLBB]] + +; ELF-LABEL: test_invoke_ib_2_direct_mismatch: +; ELF-NEXT: [[FNBEGIN:.L.*]]: +; ELF-NEXT: .cfi_startproc +; ELF-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; ELF-NEXT: .cfi_lsda 28, [[EXCEPT:.Lexception[0-9]+]] +; ELF-NEXT: // %bb.0: +; ELF-NEXT: stp x30, x19, [sp, #-16]! +; ELF-NEXT: .cfi_def_cfa_offset 16 +; ELF-NEXT: .cfi_offset w19, -8 +; ELF-NEXT: .cfi_offset w30, -16 + +; ELF-SDAG-NEXT: [[PRECALL:.L.*]]: +; ELF-SDAG-NEXT: adrp x16, :got:baz +; ELF-SDAG-NEXT: ldr x16, [x16, :got_lo12:baz] +; ELF-SDAG-NEXT: mov x17, #1234 +; ELF-SDAG-NEXT: pacia x16, x17 +; ELF-SDAG-NEXT: mov x8, x16 +; ELF-SDAG-NEXT: mov x17, #2 +; ELF-SDAG-NEXT: blrab x8, x17 +; ELF-SDAG-NEXT: [[POSTCALL:.L.*]]: +; ELF-SDAG-NEXT: // %bb.1: +; ELF-SDAG-NEXT: mov w19, w0 + +; ELF-GISEL-NEXT: adrp x16, :got:baz +; ELF-GISEL-NEXT: ldr x16, [x16, :got_lo12:baz] +; ELF-GISEL-NEXT: mov x17, #1234 +; ELF-GISEL-NEXT: pacia x16, x17 +; ELF-GISEL-NEXT: mov x8, x16 +; ELF-GISEL-NEXT: [[PRECALL:.L.*]]: +; ELF-GISEL-NEXT: mov x17, #2 +; ELF-GISEL-NEXT: blrab x8, x17 +; ELF-GISEL-NEXT: mov w19, w0 +; ELF-GISEL-NEXT: [[POSTCALL:.L.*]]: + +; ELF-NEXT: [[CALLBB:.L.*]]: +; ELF-NEXT: bl foo +; ELF-NEXT: mov w0, w19 +; ELF-NEXT: ldp x30, x19, [sp], #16 +; ELF-NEXT: ret +; ELF-NEXT: [[LPADBB:.LBB[0-9_]+]]: +; ELF-NEXT: [[LPAD:.L.*]]: +; ELF-NEXT: mov w19, #-1 +; ELF-NEXT: b [[CALLBB]] + +; CHECK-LABEL: GCC_except_table{{.*}}: +; CHECK-NEXT: [[EXCEPT]]: +; CHECK: .uleb128 [[POSTCALL]]-[[PRECALL]] {{.*}} Call between [[PRECALL]] and [[POSTCALL]] +; CHECK-NEXT: .uleb128 [[LPAD]]-[[FNBEGIN]] {{.*}} jumps to [[LPAD]] +; CHECK-NEXT: .byte 0 {{.*}} On action: cleanup + +define i32 @test_invoke_ib_2_direct_mismatch() #0 personality ptr @__gxx_personality_v0 { + %tmp0 = invoke i32 ptrauth (ptr @baz, i32 0, i64 1234)() [ "ptrauth"(i32 1, i64 2) ] to label %continuebb + unwind label %unwindbb + +unwindbb: + %tmp1 = landingpad { ptr, i32 } cleanup + call void @foo() + ret i32 -1 + +continuebb: + call void @foo() + ret i32 %tmp0 +} + +; ELF-LABEL: .L_ZTIPKc.DW.stub: +; ELF-NEXT: .xword _ZTIPKc + declare void @foo() declare void @bar(ptr) +declare i32 @baz() declare i32 @__gxx_personality_v0(...) declare ptr @__cxa_allocate_exception(i64) diff --git a/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir b/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir index 55b3e84f290f8a..c483d669a27584 100644 --- a/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir +++ b/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=aarch64-unknown-linux -run-pass=twoaddressinstruction -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-unknown-linux --passes=two-address-instruction %s -o - | FileCheck %s # REQUIRES: aarch64-registered-target # Verify that the register class is correctly constrained after the twoaddress replacement diff --git a/llvm/test/CodeGen/AArch64/swifterror.ll b/llvm/test/CodeGen/AArch64/swifterror.ll index cd06f8dbfad84c..07ee87e880aff2 100644 --- a/llvm/test/CodeGen/AArch64/swifterror.ll +++ b/llvm/test/CodeGen/AArch64/swifterror.ll @@ -977,7 +977,7 @@ define float @foo_vararg(ptr swifterror %error_ptr_ref, ...) { ; CHECK-APPLE-ARM64_32-NEXT: add x9, x29, #16 ; CHECK-APPLE-ARM64_32-NEXT: strb w8, [x0, #8] ; CHECK-APPLE-ARM64_32-NEXT: orr w8, w9, #0x4 -; CHECK-APPLE-ARM64_32-NEXT: and x10, x9, #0xfffffff0 +; CHECK-APPLE-ARM64_32-NEXT: mov w10, w9 ; CHECK-APPLE-ARM64_32-NEXT: stur w8, [x29, #-8] ; CHECK-APPLE-ARM64_32-NEXT: ldr w11, [x10] ; CHECK-APPLE-ARM64_32-NEXT: orr w10, w9, #0x8 diff --git a/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll b/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll index 39edc03ced442e..2451ea478ed71e 100644 --- a/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll +++ b/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll @@ -107,11 +107,11 @@ define i32 @overflow_add_const_limit(i8 zeroext %a, i8 zeroext %b) { define i32 @overflow_add_positive_const_limit(i8 zeroext %a) { ; CHECK-LABEL: overflow_add_positive_const_limit: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-1 // =0xffffffff -; CHECK-NEXT: mov w9, #8 // =0x8 -; CHECK-NEXT: cmp w8, w0, sxtb +; CHECK-NEXT: sxtb w9, w0 ; CHECK-NEXT: mov w8, #16 // =0x10 -; CHECK-NEXT: csel w0, w9, w8, gt +; CHECK-NEXT: cmn w9, #1 +; CHECK-NEXT: mov w9, #8 // =0x8 +; CHECK-NEXT: csel w0, w9, w8, lt ; CHECK-NEXT: ret %cmp = icmp slt i8 %a, -1 %res = select i1 %cmp, i32 8, i32 16 @@ -162,11 +162,11 @@ define i32 @safe_add_underflow_neg(i8 zeroext %a) { define i32 @overflow_sub_negative_const_limit(i8 zeroext %a) { ; CHECK-LABEL: overflow_sub_negative_const_limit: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-1 // =0xffffffff -; CHECK-NEXT: mov w9, #8 // =0x8 -; CHECK-NEXT: cmp w8, w0, sxtb +; CHECK-NEXT: sxtb w9, w0 ; CHECK-NEXT: mov w8, #16 // =0x10 -; CHECK-NEXT: csel w0, w9, w8, gt +; CHECK-NEXT: cmn w9, #1 +; CHECK-NEXT: mov w9, #8 // =0x8 +; CHECK-NEXT: csel w0, w9, w8, lt ; CHECK-NEXT: ret %cmp = icmp slt i8 %a, -1 %res = select i1 %cmp, i32 8, i32 16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 0a8e805027c77a..c701e873fdd2c5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -18,6 +18,7 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr, float %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -90,6 +91,7 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr, float %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -162,6 +164,7 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -238,6 +241,7 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_f64 v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -324,8 +328,9 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -538,8 +543,9 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -746,8 +752,9 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -972,8 +979,9 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -1186,8 +1194,9 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1395,8 +1404,9 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -1598,8 +1608,9 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -1823,8 +1834,9 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -2035,11 +2047,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v4, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2285,9 +2297,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e32 v0, v1, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2527,11 +2540,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -2800,10 +2813,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index fb81176a7419e9..90110e6e0c09ec 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -18,6 +18,7 @@ define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr, float %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -90,6 +91,7 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr, float %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -162,6 +164,7 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -238,6 +241,7 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_f64 v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -324,8 +328,9 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -538,8 +543,9 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -746,8 +752,9 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -972,8 +979,9 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -1186,8 +1194,9 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1395,8 +1404,9 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -1598,8 +1608,9 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -1823,8 +1834,9 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -2035,11 +2047,11 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v4, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2285,9 +2297,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e32 v0, v1, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2527,11 +2540,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -2800,10 +2813,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll index b058ad1023e130..b54aec935bd5ff 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll @@ -1,249 +1,1247 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX90A,GFX90A_ITERATIVE %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX90A,GFX90A_DPP %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX940,GFX940_ITERATIVE %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefixes=GFX940,GFX940_DPP %s define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret void } define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX90A_ITERATIVE: bb.1 (%ir-block.0): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000) + ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.5): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.3 (%ir-block.7): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], %25, [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.4.Flow: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: SI_END_CF %35, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.5 (%ir-block.9): + ; GFX90A_ITERATIVE-NEXT: S_ENDPGM 0 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.6.Flow1: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeLoop: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %17, %bb.7, [[S_MOV_B]], %bb.2 + ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI %22, %bb.7, [[COPY4]], %bb.2 + ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY5]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY6]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_]] + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_1]] + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY7]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1 + ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]] + ; GFX90A_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY8]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY9]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY10]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY11]], [[V_NOT_B32_e32_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY12]], [[V_NOT_B32_e32_1]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]] + ; GFX90A_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE2]], [[COPY13]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]] + ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.8 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.8.ComputeEnd: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.7 + ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY16]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE3]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY14]] + ; GFX90A_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY18]], [[COPY19]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY17]] + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY20]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY21]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.3 + ; + ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX90A_DPP: bb.1 (%ir-block.0): + ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.2 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.2 (%ir-block.5): + ; GFX90A_DPP-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A_DPP-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX90A_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY19]], [[S_MOV_B32_2]] + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]] + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY21]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY22]], implicit $exec + ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.3 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.3 (%ir-block.31): + ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.4.Flow: + ; GFX90A_DPP-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.5 (%ir-block.33): + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_ENDPGM 0 + ; + ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX940_ITERATIVE: bb.1 (%ir-block.0): + ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000) + ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.5): + ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.3 (%ir-block.7): + ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], %24, [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.4.Flow: + ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: SI_END_CF %34, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.5 (%ir-block.9): + ; GFX940_ITERATIVE-NEXT: S_ENDPGM 0 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.6.Flow1: + ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.7.ComputeLoop: + ; GFX940_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %16, %bb.7, [[S_MOV_B]], %bb.2 + ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI %21, %bb.7, [[COPY4]], %bb.2 + ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY5]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY6]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_]] + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_1]] + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY7]], 0, 0, implicit $mode, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1 + ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]] + ; GFX940_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY8]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY9]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY10]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY11]], [[V_NOT_B32_e32_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY12]], [[V_NOT_B32_e32_1]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]] + ; GFX940_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE2]], [[COPY13]], implicit $exec + ; GFX940_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]] + ; GFX940_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.8 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.8.ComputeEnd: + ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.7 + ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY16]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE3]].sub0 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY14]] + ; GFX940_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY18]], [[COPY19]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY17]] + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY20]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY21]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.3 + ; + ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX940_DPP: bb.1 (%ir-block.0): + ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.2 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.2 (%ir-block.5): + ; GFX940_DPP-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0 + ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX940_DPP-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX940_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY19]], [[S_MOV_B32_2]] + ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]] + ; GFX940_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX940_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY21]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY22]], implicit $exec + ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.3 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.3 (%ir-block.31): + ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.4.Flow: + ; GFX940_DPP-NEXT: successors: %bb.5(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.5 (%ir-block.33): + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX90A_ITERATIVE: bb.1 (%ir-block.0): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000) + ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.5): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.3 (%ir-block.7): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], %28, [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.4 (%ir-block.9): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.8 + ; GFX90A_ITERATIVE-NEXT: SI_END_CF %38, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY7]], 0, %27, 0, 0, implicit $mode, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY10]], 0, [[COPY8]], %36, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY11]], 0, [[COPY9]], %36, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_CNDMASK_B32_e64_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.5 (%ir-block.14): + ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY %44.sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY %44.sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX90A_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.6.Flow: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[REG_SEQUENCE2]], %bb.4, [[DEF]], %bb.1 + ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeLoop: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI %19, %bb.7, [[S_MOV_B]], %bb.2 + ; GFX90A_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI %18, %bb.7, [[DEF]], %bb.2 + ; GFX90A_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:vreg_64_align2 = PHI %24, %bb.7, [[COPY4]], %bb.2 + ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY14]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY15]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_4]] + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_5]] + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_7]] + ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_6]], $m0, [[COPY18]] + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_9]] + ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_8]], $m0, [[COPY19]] + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_]], %subreg.sub0, [[V_WRITELANE_B32_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY20]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1 + ; GFX90A_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]] + ; GFX90A_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY21]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY22]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY23]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY24]], [[V_NOT_B32_e32_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY25]], [[V_NOT_B32_e32_1]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GFX90A_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]] + ; GFX90A_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE5]], [[COPY26]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]] + ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.8 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.8.ComputeEnd: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[REG_SEQUENCE4]], %bb.7 + ; GFX90A_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.7 + ; GFX90A_ITERATIVE-NEXT: [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY28:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY29:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY29]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY30:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE6]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_ITERATIVE-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[COPY27]] + ; GFX90A_ITERATIVE-NEXT: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY31]], [[COPY32]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[COPY30]] + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY33]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY34]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.3 + ; + ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX90A_DPP: bb.1 (%ir-block.0): + ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.2 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.2 (%ir-block.5): + ; GFX90A_DPP-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY19:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX90A_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY19]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX90A_DPP-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]] + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY21]], [[S_MOV_B32_2]] + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY22:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY22]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY23]], implicit $exec + ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.3 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.3 (%ir-block.32): + ; GFX90A_DPP-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_DPP-NEXT: S_BRANCH %bb.5 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.4.Flow: + ; GFX90A_DPP-NEXT: successors: %bb.6(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %45, %bb.5, [[DEF]], %bb.1 + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.6 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.5 (%ir-block.35): + ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.2 + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX90A_DPP-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[STRICT_WWM1:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[V_MOV_B6]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]] + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY26]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A_DPP-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A_DPP-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_6]].sub0 + ; GFX90A_DPP-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_6]].sub1 + ; GFX90A_DPP-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY29]], 0, [[COPY27]], [[V_CMP_EQ_U32_e64_]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY30]], 0, [[COPY28]], [[V_CMP_EQ_U32_e64_]], implicit $exec + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_CNDMASK_B32_e64_1]], %subreg.sub1 + ; GFX90A_DPP-NEXT: S_BRANCH %bb.4 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.6 (%ir-block.41): + ; GFX90A_DPP-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX90A_DPP-NEXT: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY31]], implicit $exec + ; GFX90A_DPP-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY32]], implicit $exec + ; GFX90A_DPP-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX90A_DPP-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX940_ITERATIVE: bb.1 (%ir-block.0): + ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000) + ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.5): + ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.3 (%ir-block.7): + ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], %27, [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.4 (%ir-block.9): + ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.8 + ; GFX940_ITERATIVE-NEXT: SI_END_CF %37, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY7]], 0, %26, 0, 0, implicit $mode, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY10]], 0, [[COPY8]], %35, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY11]], 0, [[COPY9]], %35, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_CNDMASK_B32_e64_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.5 (%ir-block.14): + ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY %43.sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY %43.sub1 + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX940_ITERATIVE-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX940_ITERATIVE-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX940_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.6.Flow: + ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[REG_SEQUENCE2]], %bb.4, [[DEF]], %bb.1 + ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.7.ComputeLoop: + ; GFX940_ITERATIVE-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI %18, %bb.7, [[S_MOV_B]], %bb.2 + ; GFX940_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI %17, %bb.7, [[DEF]], %bb.2 + ; GFX940_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:vreg_64_align2 = PHI %23, %bb.7, [[COPY4]], %bb.2 + ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY14]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_FFBL_B32_e64_1:%[0-9]+]]:vgpr_32 = V_FFBL_B32_e64 [[COPY15]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FFBL_B32_e64_1]], [[V_MOV_B32_e32_1]], 0, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_FFBL_B32_e64_]], [[V_ADD_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY2]], [[V_READFIRSTLANE_B32_4]] + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY3]], [[V_READFIRSTLANE_B32_5]] + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_7]] + ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_6]], $m0, [[COPY18]] + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MIN_U32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[V_READFIRSTLANE_B32_9]] + ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_8]], $m0, [[COPY19]] + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_]], %subreg.sub0, [[V_WRITELANE_B32_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY20]], 0, 0, implicit $mode, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1 + ; GFX940_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]] + ; GFX940_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY21]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY22]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_NOT_B32_e32_1:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[COPY23]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI4]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY24]], [[V_NOT_B32_e32_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY25]], [[V_NOT_B32_e32_1]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GFX940_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]] + ; GFX940_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE5]], [[COPY26]], implicit $exec + ; GFX940_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]] + ; GFX940_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.8 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.8.ComputeEnd: + ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[REG_SEQUENCE4]], %bb.7 + ; GFX940_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.7 + ; GFX940_ITERATIVE-NEXT: [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY28:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY29:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY29]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY30:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE6]].sub0 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_ITERATIVE-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[COPY27]] + ; GFX940_ITERATIVE-NEXT: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY31]], [[COPY32]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[COPY30]] + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY33]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY34]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.3 + ; + ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX940_DPP: bb.1 (%ir-block.0): + ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.2 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.2 (%ir-block.5): + ; GFX940_DPP-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0 + ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY9]], [[COPY10]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY11]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_DPP-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[REG_SEQUENCE1]], [[COPY12]], implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY13]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY14]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY15]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY16]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY17]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY18:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY18]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY19:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B]] + ; GFX940_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[COPY19]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX940_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX940_DPP-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY20]], [[S_MOV_B32_2]] + ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[COPY21]], [[S_MOV_B32_2]] + ; GFX940_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY22:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX940_DPP-NEXT: [[STRICT_WWM:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[COPY22]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY23]], implicit $exec + ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.3 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.3 (%ir-block.32): + ; GFX940_DPP-NEXT: successors: %bb.5(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_DPP-NEXT: S_BRANCH %bb.5 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.4.Flow: + ; GFX940_DPP-NEXT: successors: %bb.6(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %44, %bb.5, [[DEF]], %bb.1 + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.6 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.5 (%ir-block.35): + ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.3, [[DEF]], %bb.2 + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940_DPP-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX940_DPP-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec + ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec + ; GFX940_DPP-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[STRICT_WWM1:%[0-9]+]]:vreg_64_align2 = STRICT_WWM [[V_MOV_B6]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]] + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY26]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940_DPP-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940_DPP-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_6]].sub0 + ; GFX940_DPP-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_6]].sub1 + ; GFX940_DPP-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY29]], 0, [[COPY27]], [[V_CMP_EQ_U32_e64_]], implicit $exec + ; GFX940_DPP-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY30]], 0, [[COPY28]], [[V_CMP_EQ_U32_e64_]], implicit $exec + ; GFX940_DPP-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_CNDMASK_B32_e64_]], %subreg.sub0, [[V_CNDMASK_B32_e64_1]], %subreg.sub1 + ; GFX940_DPP-NEXT: S_BRANCH %bb.4 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.6 (%ir-block.41): + ; GFX940_DPP-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX940_DPP-NEXT: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY31]], implicit $exec + ; GFX940_DPP-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY32]], implicit $exec + ; GFX940_DPP-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX940_DPP-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret double %ret } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll index a9f0e546eb35b6..48217b9bb0d93b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -1293,7 +1293,8 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(ptr addrspace(1) inr ; GFX12-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4095: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_dual_mov_b32 v0, 2 :: v_dual_mov_b32 v1, 0 -; GFX12-NEXT: global_atomic_add_u32 v0, v1, v0, s[2:3] offset:16380 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u32 v0, v1, v0, s[2:3] offset:16380 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog @@ -1344,7 +1345,8 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace( ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, 2 -; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog @@ -1385,7 +1387,8 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4095(ptr addrspace(1) %pt ; GFX12-LABEL: mubuf_atomicrmw_vgpr_ptr_offset4095: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 2 -; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off offset:16380 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off offset:16380 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog @@ -1433,7 +1436,8 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace( ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: v_mov_b32_e32 v2, 2 -; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog @@ -1485,7 +1489,8 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(ptr addrspace(1) in ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v4, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v4, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog @@ -1529,7 +1534,8 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4095(ptr addrspace(1) inreg ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v0, v[1:2], s[2:3] offset:16380 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v0, v[1:2], s[2:3] offset:16380 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog @@ -1582,7 +1588,8 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1) ; GFX12-NEXT: s_add_co_ci_u32 s1, s3, 4 ; GFX12-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[3:4], v[1:2], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[3:4], v[1:2], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog @@ -1624,7 +1631,8 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4095(ptr addrspace(1) %ptr, ; GFX12-LABEL: mubuf_cmpxchg_vgpr_ptr_offset4095: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off offset:16380 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off offset:16380 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog @@ -1672,7 +1680,8 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v6, vcc_lo -; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog @@ -1725,7 +1734,8 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(ptr addrspace(1) inre ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo -; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/twoaddr-extract-dyn-v7f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/twoaddr-extract-dyn-v7f64.mir index 6c13756ab1c690..e84c51b73ad1e9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/twoaddr-extract-dyn-v7f64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/twoaddr-extract-dyn-v7f64.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -early-live-intervals -run-pass=liveintervals -run-pass=twoaddressinstruction -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -passes='require,two-address-instruction' -o - %s | FileCheck %s --- name: dyn_extract_v7f64_v_v diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll index 83016f1d2d3c85..fa68722ff67414 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll @@ -93,3 +93,58 @@ define amdgpu_kernel void @constant_from_inttoptr() { store i8 %load, ptr addrspace(1) undef ret void } + +define void @broken_phi() { +; GFX9-LABEL: @broken_phi( +; GFX9-NEXT: bb: +; GFX9-NEXT: br label [[BB1:%.*]] +; GFX9: bb1: +; GFX9-NEXT: [[I:%.*]] = phi <4 x i8> [ , [[BB:%.*]] ], [ [[I8:%.*]], [[BB7:%.*]] ] +; GFX9-NEXT: br i1 false, label [[BB3:%.*]], label [[BB2:%.*]] +; GFX9: bb2: +; GFX9-NEXT: br label [[BB3]] +; GFX9: bb3: +; GFX9-NEXT: [[I4:%.*]] = phi <4 x i8> [ zeroinitializer, [[BB2]] ], [ [[I]], [[BB1]] ] +; GFX9-NEXT: br i1 false, label [[BB7]], label [[BB5:%.*]] +; GFX9: bb5: +; GFX9-NEXT: [[I6:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[I4]], <4 x i8> zeroinitializer) +; GFX9-NEXT: br label [[BB7]] +; GFX9: bb7: +; GFX9-NEXT: [[I8]] = phi <4 x i8> [ zeroinitializer, [[BB5]] ], [ zeroinitializer, [[BB3]] ] +; GFX9-NEXT: br label [[BB1]] +; +; GFX12-LABEL: @broken_phi( +; GFX12-NEXT: bb: +; GFX12-NEXT: br label [[BB1:%.*]] +; GFX12: bb1: +; GFX12-NEXT: [[I:%.*]] = phi <4 x i8> [ , [[BB:%.*]] ], [ [[I8:%.*]], [[BB7:%.*]] ] +; GFX12-NEXT: br i1 false, label [[BB3:%.*]], label [[BB2:%.*]] +; GFX12: bb2: +; GFX12-NEXT: br label [[BB3]] +; GFX12: bb3: +; GFX12-NEXT: [[I4:%.*]] = phi <4 x i8> [ zeroinitializer, [[BB2]] ], [ [[I]], [[BB1]] ] +; GFX12-NEXT: br i1 false, label [[BB7]], label [[BB5:%.*]] +; GFX12: bb5: +; GFX12-NEXT: [[I6:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[I4]], <4 x i8> zeroinitializer) +; GFX12-NEXT: br label [[BB7]] +; GFX12: bb7: +; GFX12-NEXT: [[I8]] = phi <4 x i8> [ zeroinitializer, [[BB5]] ], [ zeroinitializer, [[BB3]] ] +; GFX12-NEXT: br label [[BB1]] +; +bb: + br label %bb1 +bb1: + %i = phi <4 x i8> [ , %bb ], [ %i8, %bb7 ] + br i1 false, label %bb3, label %bb2 +bb2: + br label %bb3 +bb3: + %i4 = phi <4 x i8> [ zeroinitializer, %bb2 ], [ %i, %bb1 ] + br i1 false, label %bb7, label %bb5 +bb5: + %i6 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> %i4, <4 x i8> zeroinitializer) + br label %bb7 +bb7: + %i8 = phi <4 x i8> [ zeroinitializer, %bb5 ], [ zeroinitializer, %bb3 ] + br label %bb1 +} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index fb764560154d58..b1134ae78cb979 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -574,13 +574,44 @@ entry: define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .LBB2_1: ; %ComputeLoop +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_readlane_b32 s8, v0, s5 +; GFX6-NEXT: v_writelane_b32 v1, s4, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_cbranch_vccnz .LBB2_1 +; GFX6-NEXT: ; %bb.2: ; %ComputeEnd +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_cbranch_execz .LBB2_4 +; GFX6-NEXT: ; %bb.3: +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX6-NEXT: .LBB2_4: +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -937,15 +968,46 @@ entry: define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %vindex) { ; GFX6-LABEL: struct_add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[2:3], 0x11 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .LBB3_1: ; %ComputeLoop +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_readlane_b32 s8, v0, s5 +; GFX6-NEXT: v_writelane_b32 v1, s4, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_cbranch_vccnz .LBB3_1 +; GFX6-NEXT: ; %bb.2: ; %ComputeEnd +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_cbranch_execz .LBB3_4 +; GFX6-NEXT: ; %bb.3: +; GFX6-NEXT: s_load_dword s5, s[2:3], 0x11 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX6-NEXT: .LBB3_4: +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -2011,13 +2073,44 @@ entry: define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .LBB7_1: ; %ComputeLoop +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_readlane_b32 s8, v0, s5 +; GFX6-NEXT: v_writelane_b32 v1, s4, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_cbranch_vccnz .LBB7_1 +; GFX6-NEXT: ; %bb.2: ; %ComputeEnd +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_cbranch_execz .LBB7_4 +; GFX6-NEXT: ; %bb.3: +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX6-NEXT: .LBB7_4: +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index ca4812f345958a..bc5d2662dcb45f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1,13 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX1264 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX1232 %s +; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_DPP %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -46,37 +55,69 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; -; GFX89-LABEL: add_i32_constant: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b64 s[6:7], exec -; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX89-NEXT: ; implicit-def: $vgpr1 -; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX89-NEXT: s_cbranch_execz .LBB0_2 -; GFX89-NEXT: ; %bb.1: -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7] -; GFX89-NEXT: s_mul_i32 s2, s2, 5 -; GFX89-NEXT: s_mov_b32 s11, 0xf000 -; GFX89-NEXT: s_mov_b32 s10, -1 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: v_mov_b32_e32 v1, s2 -; GFX89-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_wbinvl1_vol -; GFX89-NEXT: .LBB0_2: -; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX89-NEXT: v_readfirstlane_b32 s4, v1 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4 -; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX89-NEXT: s_endpgm +; GFX8-LABEL: add_i32_constant: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_cbranch_execz .LBB0_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s8, s2 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s9, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: .LBB0_2: +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_readfirstlane_b32 s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: add_i32_constant: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: .LBB0_2: +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_constant: ; GFX1064: ; %bb.0: ; %entry @@ -238,7 +279,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: global_wb scope:SCOPE_DEV +; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB0_2: @@ -273,7 +315,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: global_wb scope:SCOPE_DEV +; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB0_2: @@ -567,7 +610,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: v_mov_b32_e32 v1, s3 ; GFX1264-NEXT: s_mov_b32 s8, s6 ; GFX1264-NEXT: s_mov_b32 s9, s7 -; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: global_wb scope:SCOPE_DEV +; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB1_2: @@ -604,7 +648,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_mov_b32_e32 v1, s2 ; GFX1232-NEXT: s_mov_b32 s8, s6 ; GFX1232-NEXT: s_mov_b32 s9, s7 -; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: global_wb scope:SCOPE_DEV +; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB1_2: @@ -626,642 +671,1230 @@ entry: } define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { -; GFX7LESS-LABEL: add_i32_varying: +; GFX7LESS_ITERATIVE-LABEL: add_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s4 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB2_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1 +; GFX7LESS_ITERATIVE-NEXT: .LBB2_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt expcnt(0) +; GFX7LESS_ITERATIVE-NEXT: v_add_i32_e32 v0, vcc, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: add_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s4 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX8_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol +; GFX8_ITERATIVE-NEXT: .LBB2_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_add_u32_e32 v0, vcc, s4, v1 +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: add_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s4 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX9_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol +; GFX9_ITERATIVE-NEXT: .LBB2_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_add_u32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: add_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1064_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB2_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: add_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s1 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s1 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s4, s4, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1032_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB2_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: add_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1164_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB2_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: add_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_add_i32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1132_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB2_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX1264_ITERATIVE-LABEL: add_i32_varying: +; GFX1264_ITERATIVE: ; %bb.0: ; %entry +; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 +; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1264_ITERATIVE-NEXT: ; %bb.3: +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1264_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 +; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: .LBB2_4: +; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1264_ITERATIVE-NEXT: s_nop 0 +; GFX1264_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264_ITERATIVE-NEXT: s_endpgm +; +; GFX1232_ITERATIVE-LABEL: add_i32_varying: +; GFX1232_ITERATIVE: ; %bb.0: ; %entry +; GFX1232_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s4, s4, s5 +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1232_ITERATIVE-NEXT: ; %bb.3: +; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1232_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 +; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: .LBB2_4: +; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1232_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1232_ITERATIVE-NEXT: s_nop 0 +; GFX1232_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: add_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 +; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6 +; GFX7LESS_DPP-NEXT: s_mov_b32 s11, s7 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s8, s2 +; GFX7LESS_DPP-NEXT: s_mov_b32 s9, s3 +; GFX7LESS_DPP-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX7LESS_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS_DPP-NEXT: buffer_wbinvl1 +; GFX7LESS_DPP-NEXT: s_mov_b32 s4, s0 +; GFX7LESS_DPP-NEXT: s_mov_b32 s5, s1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: add_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s6, v2, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s10, -1 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: s_mov_b32 s8, s2 +; GFX8_DPP-NEXT: s_mov_b32 s9, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX8_DPP-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX8_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX8_DPP-NEXT: buffer_wbinvl1_vol +; GFX8_DPP-NEXT: .LBB2_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: add_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s10, -1 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: s_mov_b32 s8, s2 +; GFX9_DPP-NEXT: s_mov_b32 s9, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX9_DPP-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9_DPP-NEXT: buffer_wbinvl1_vol +; GFX9_DPP-NEXT: .LBB2_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: add_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b32 s4, s9 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1064_DPP-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl1_inv +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB2_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: add_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s4, s6 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1032_DPP-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl1_inv +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB2_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: add_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b32 s4, s9 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1164_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1164_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl1_inv +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB2_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: add_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1132_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl1_inv +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB2_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +; +; GFX1264_DPP-LABEL: add_i32_varying: +; GFX1264_DPP: ; %bb.0: ; %entry +; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1264_DPP-NEXT: s_not_b64 exec, exec +; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1264_DPP-NEXT: s_not_b64 exec, exec +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1264_DPP-NEXT: s_mov_b32 s4, s9 +; GFX1264_DPP-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1264_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1264_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1264_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1264_DPP-NEXT: ; %bb.1: +; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1264_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1264_DPP-NEXT: global_wb scope:SCOPE_DEV +; GFX1264_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 +; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV +; GFX1264_DPP-NEXT: .LBB2_2: +; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1264_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1264_DPP-NEXT: s_nop 0 +; GFX1264_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264_DPP-NEXT: s_endpgm +; +; GFX1232_DPP-LABEL: add_i32_varying: +; GFX1232_DPP: ; %bb.0: ; %entry +; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 +; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1232_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1232_DPP-NEXT: ; %bb.1: +; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1232_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1232_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1232_DPP-NEXT: global_wb scope:SCOPE_DEV +; GFX1232_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 +; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV +; GFX1232_DPP-NEXT: .LBB2_2: +; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1232_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1232_DPP-NEXT: s_nop 0 +; GFX1232_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw add ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel + store i32 %old, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace(1) %inout) { +; GFX7LESS-LABEL: add_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: s_mov_b32 s10, s6 -; GFX7LESS-NEXT: s_mov_b32 s11, s7 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB3_2 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s8, s2 ; GFX7LESS-NEXT: s_mov_b32 s9, s3 -; GFX7LESS-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: s_mov_b32 s4, s0 -; GFX7LESS-NEXT: s_mov_b32 s5, s1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7LESS-NEXT: .LBB3_2: +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s5, v0 +; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; -; GFX8-LABEL: add_i32_varying: +; GFX8-LABEL: add_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s6, 0 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB2_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s4 -; GFX8-NEXT: v_readlane_b32 s7, v0, s4 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX8-NEXT: v_writelane_b32 v1, s6, m0 -; GFX8-NEXT: s_add_i32 s6, s6, s7 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8-NEXT: s_cbranch_execz .LBB2_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_cbranch_execz .LBB3_2 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s8, s2 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s9, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB2_4: +; GFX8-NEXT: .LBB3_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: v_readfirstlane_b32 s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_nop 2 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; -; GFX9-LABEL: add_i32_varying: +; GFX9-LABEL: add_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s6, 0 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB2_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s4 -; GFX9-NEXT: v_readlane_b32 s7, v0, s4 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9-NEXT: v_writelane_b32 v1, s6, m0 -; GFX9-NEXT: s_add_i32 s6, s6, s7 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s9, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB2_4: +; GFX9-NEXT: .LBB3_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_nop 2 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: add_i32_varying: +; GFX1064-LABEL: add_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s6, 0 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s7, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s7 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1064-NEXT: v_writelane_b32 v1, s6, s7 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX1064-NEXT: s_add_i32 s6, s6, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064-NEXT: s_cbranch_execz .LBB2_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: s_cbranch_execz .LBB3_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_mul_i32 s6, s6, 5 ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mov_b32 s8, s2 ; GFX1064-NEXT: s_mov_b32 s9, s3 -; GFX1064-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB2_4: +; GFX1064-NEXT: .LBB3_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1 ; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: add_i32_varying: +; GFX1032-LABEL: add_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s1 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1032-NEXT: v_writelane_b32 v1, s4, s1 -; GFX1032-NEXT: s_andn2_b32 s0, s0, s6 -; GFX1032-NEXT: s_add_i32 s4, s4, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1032-NEXT: s_cbranch_execz .LBB2_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB3_2 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_mul_i32 s5, s5, 5 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: v_mov_b32_e32 v0, s5 ; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mov_b32 s8, s2 ; GFX1032-NEXT: s_mov_b32 s9, s3 -; GFX1032-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB2_4: +; GFX1032-NEXT: .LBB3_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1 ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: add_i32_varying: +; GFX1164-LABEL: add_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s6, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1164-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_add_i32 s6, s6, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[6:7], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1164-NEXT: s_cbranch_execz .LBB2_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB3_2 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: s_mul_i32 s6, s6, 5 ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164-NEXT: s_mov_b32 s10, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_mov_b32 s8, s2 ; GFX1164-NEXT: s_mov_b32 s9, s3 -; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB2_4: +; GFX1164-NEXT: .LBB3_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: add_i32_varying: +; GFX1132-LABEL: add_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1132-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1132-NEXT: s_and_not1_b32 s0, s0, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_add_i32 s4, s4, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1132-NEXT: s_cbranch_execz .LBB2_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v1, s4 +; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB3_2 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_mul_i32 s5, s5, 5 ; GFX1132-NEXT: s_mov_b32 s10, -1 +; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mov_b32 s8, s2 ; GFX1132-NEXT: s_mov_b32 s9, s3 -; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB2_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1132-NEXT: .LBB3_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; -; GFX1264-LABEL: add_i32_varying: -; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1264-NEXT: s_mov_b64 s[0:1], exec -; GFX1264-NEXT: s_mov_b32 s6, 0 -; GFX1264-NEXT: ; implicit-def: $vgpr0 -; GFX1264-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1264-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1264-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1264-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1264-NEXT: s_add_co_i32 s6, s6, s8 -; GFX1264-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1264-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1264-NEXT: ; implicit-def: $vgpr1 -; GFX1264-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1264-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1264-NEXT: s_cbranch_execz .LBB2_4 -; GFX1264-NEXT: ; %bb.3: -; GFX1264-NEXT: v_mov_b32_e32 v1, s6 -; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-NEXT: s_mov_b32 s10, -1 -; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mov_b32 s8, s2 -; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1264-NEXT: s_wait_loadcnt 0x0 -; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB2_4: -; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1264-NEXT: s_mov_b32 s2, -1 -; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null -; GFX1264-NEXT: s_nop 0 -; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1264-NEXT: s_endpgm -; -; GFX1232-LABEL: add_i32_varying: -; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1232-NEXT: s_mov_b32 s0, exec_lo -; GFX1232-NEXT: s_mov_b32 s4, 0 -; GFX1232-NEXT: ; implicit-def: $vgpr0 -; GFX1232-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1232-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1232-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1232-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1232-NEXT: s_and_not1_b32 s0, s0, s6 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1232-NEXT: s_add_co_i32 s4, s4, s5 -; GFX1232-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1232-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1232-NEXT: ; implicit-def: $vgpr1 -; GFX1232-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1232-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1232-NEXT: s_cbranch_execz .LBB2_4 -; GFX1232-NEXT: ; %bb.3: -; GFX1232-NEXT: v_mov_b32_e32 v1, s4 -; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mov_b32 s8, s2 -; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_wait_loadcnt 0x0 -; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB2_4: -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1232-NEXT: s_mov_b32 s2, -1 -; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null -; GFX1232-NEXT: s_nop 0 -; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1232-NEXT: s_endpgm -entry: - %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = atomicrmw add ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel - store i32 %old, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace(1) %inout) { -; GFX7LESS-LABEL: add_i64_constant: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB3_2 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s8, s2 -; GFX7LESS-NEXT: s_mov_b32 s9, s3 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: .LBB3_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s5, v0 -; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm -; -; GFX89-LABEL: add_i64_constant: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b64 s[6:7], exec -; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 -; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX89-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX89-NEXT: s_cbranch_execz .LBB3_2 -; GFX89-NEXT: ; %bb.1: -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7] -; GFX89-NEXT: s_mul_i32 s2, s2, 5 -; GFX89-NEXT: s_mov_b32 s11, 0xf000 -; GFX89-NEXT: s_mov_b32 s10, -1 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: v_mov_b32_e32 v0, s2 -; GFX89-NEXT: v_mov_b32_e32 v1, 0 -; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_wbinvl1_vol -; GFX89-NEXT: .LBB3_2: -; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_readfirstlane_b32 s2, v1 -; GFX89-NEXT: v_readfirstlane_b32 s3, v0 -; GFX89-NEXT: v_mov_b32_e32 v0, s3 -; GFX89-NEXT: v_mov_b32_e32 v1, s2 -; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: s_nop 2 -; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX89-NEXT: s_endpgm -; -; GFX1064-LABEL: add_i64_constant: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b64 s[6:7], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB3_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s6, s6, 5 -; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s8, s2 -; GFX1064-NEXT: s_mov_b32 s9, s3 -; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: buffer_gl1_inv -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB3_2: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: add_i64_constant: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB3_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s5, s5, 5 -; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: v_mov_b32_e32 v0, s5 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s8, s2 -; GFX1032-NEXT: s_mov_b32 s9, s3 -; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB3_2: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: add_i64_constant: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b64 s[6:7], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB3_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_mul_i32 s6, s6, 5 -; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1164-NEXT: v_mov_b32_e32 v0, s6 -; GFX1164-NEXT: s_mov_b32 s10, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s8, s2 -; GFX1164-NEXT: s_mov_b32 s9, s3 -; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB3_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: add_i64_constant: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB3_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1132-NEXT: s_mul_i32 s5, s5, 5 -; GFX1132-NEXT: s_mov_b32 s10, -1 -; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s8, s2 -; GFX1132-NEXT: s_mov_b32 s9, s3 -; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB3_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm -; -; GFX1264-LABEL: add_i64_constant: +; GFX1264-LABEL: add_i64_constant: ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec @@ -1283,7 +1916,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: global_wb scope:SCOPE_DEV +; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB3_2: @@ -1320,7 +1954,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: global_wb scope:SCOPE_DEV +; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB3_2: @@ -1669,7 +2304,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: s_mov_b32 s8, s6 ; GFX1264-NEXT: s_mov_b32 s9, s7 -; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: global_wb scope:SCOPE_DEV +; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB4_2: @@ -1709,7 +2345,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX1232-NEXT: s_mov_b32 s12, s6 ; GFX1232-NEXT: s_mov_b32 s13, s7 -; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: global_wb scope:SCOPE_DEV +; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB4_2: @@ -1733,151 +2370,1367 @@ entry: } define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { -; GFX7LESS-LABEL: add_i64_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 s10, s6 -; GFX7LESS-NEXT: s_mov_b32 s11, s7 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s8, s2 -; GFX7LESS-NEXT: s_mov_b32 s9, s3 -; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: s_mov_b32 s4, s0 -; GFX7LESS-NEXT: s_mov_b32 s5, s1 -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX7LESS_ITERATIVE-LABEL: add_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1 +; GFX7LESS_ITERATIVE-NEXT: .LBB5_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt expcnt(0) +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX7LESS_ITERATIVE-NEXT: v_add_i32_e32 v0, vcc, s5, v1 +; GFX7LESS_ITERATIVE-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; -; GFX89-LABEL: add_i64_varying: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: v_mov_b32_e32 v1, 0 -; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_wbinvl1_vol -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX89-NEXT: s_endpgm -; -; GFX10-LABEL: add_i64_varying: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s11, s7 -; GFX10-NEXT: s_mov_b32 s10, s6 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s2 -; GFX10-NEXT: s_mov_b32 s9, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 -; GFX10-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX10-NEXT: s_endpgm -; -; GFX1164-LABEL: add_i64_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s6, -1 -; GFX1164-NEXT: s_mov_b32 s11, s7 -; GFX1164-NEXT: s_mov_b32 s10, s6 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s8, s2 -; GFX1164-NEXT: s_mov_b32 s9, s3 -; GFX1164-NEXT: s_mov_b32 s4, s0 -; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: s_mov_b32 s5, s1 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX8_ITERATIVE-LABEL: add_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 +; GFX8_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX8_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc +; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol +; GFX8_ITERATIVE-NEXT: .LBB5_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_ITERATIVE-NEXT: v_add_u32_e32 v0, vcc, s5, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX1132-LABEL: add_i64_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s6, -1 -; GFX1132-NEXT: s_mov_b32 s11, s7 -; GFX1132-NEXT: s_mov_b32 s10, s6 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s8, s2 -; GFX1132-NEXT: s_mov_b32 s9, s3 -; GFX1132-NEXT: s_mov_b32 s4, s0 -; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: s_mov_b32 s5, s1 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: add_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 +; GFX9_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX9_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc +; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol +; GFX9_ITERATIVE-NEXT: .LBB5_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_ITERATIVE-NEXT: v_add_co_u32_e32 v0, vcc, s5, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v2, vcc +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX1264-LABEL: add_i64_varying: -; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1264-NEXT: v_mov_b32_e32 v1, 0 -; GFX1264-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1264-NEXT: s_mov_b32 s6, -1 -; GFX1264-NEXT: s_mov_b32 s11, s7 -; GFX1264-NEXT: s_mov_b32 s10, s6 -; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mov_b32 s8, s2 -; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: s_mov_b32 s4, s0 -; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1264-NEXT: s_wait_loadcnt 0x0 -; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: s_mov_b32 s5, s1 -; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX1264-NEXT: s_nop 0 -; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1264-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: add_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s6 +; GFX1064_ITERATIVE-NEXT: s_add_u32 s4, s4, s7 +; GFX1064_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1064_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc +; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB5_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1 +; GFX1064_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX1232-LABEL: add_i64_varying: -; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-NEXT: s_mov_b32 s6, -1 -; GFX1232-NEXT: s_mov_b32 s11, s7 -; GFX1232-NEXT: s_mov_b32 s10, s6 -; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mov_b32 s8, s2 -; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: s_mov_b32 s4, s0 -; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_wait_loadcnt 0x0 -; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: s_mov_b32 s5, s1 -; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX1232-NEXT: s_nop 0 -; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1232-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: add_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s1 +; GFX1032_ITERATIVE-NEXT: s_add_u32 s4, s4, s6 +; GFX1032_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1032_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc +; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB5_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1 +; GFX1032_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: add_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s6 +; GFX1164_ITERATIVE-NEXT: s_add_u32 s4, s4, s7 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1164_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], 0 glc +; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB5_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v0 +; GFX1164_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: add_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_ITERATIVE-NEXT: s_add_u32 s4, s4, s6 +; GFX1132_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1132_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], 0 glc +; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB5_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 +; GFX1132_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX1264_ITERATIVE-LABEL: add_i64_varying: +; GFX1264_ITERATIVE: ; %bb.0: ; %entry +; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s10 +; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] +; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX1264_ITERATIVE-NEXT: ; %bb.3: +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1264_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 +; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: .LBB5_4: +; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v0 +; GFX1264_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1264_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX1264_ITERATIVE-NEXT: s_nop 0 +; GFX1264_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264_ITERATIVE-NEXT: s_endpgm +; +; GFX1232_ITERATIVE-LABEL: add_i64_varying: +; GFX1232_ITERATIVE: ; %bb.0: ; %entry +; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s1 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 +; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 +; GFX1232_ITERATIVE-NEXT: ; %bb.3: +; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1232_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 +; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: .LBB5_4: +; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 +; GFX1232_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1232_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX1232_ITERATIVE-NEXT: s_nop 0 +; GFX1232_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: add_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6 +; GFX7LESS_DPP-NEXT: s_mov_b32 s11, s7 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s8, s2 +; GFX7LESS_DPP-NEXT: s_mov_b32 s9, s3 +; GFX7LESS_DPP-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc +; GFX7LESS_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS_DPP-NEXT: buffer_wbinvl1 +; GFX7LESS_DPP-NEXT: s_mov_b32 s4, s0 +; GFX7LESS_DPP-NEXT: s_mov_b32 s5, s1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: add_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s7 +; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s10, -1 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: s_mov_b32 s8, s2 +; GFX8_DPP-NEXT: s_mov_b32 s9, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s6 +; GFX8_DPP-NEXT: buffer_atomic_add_x2 v[7:8], off, s[8:11], 0 glc +; GFX8_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX8_DPP-NEXT: buffer_wbinvl1_vol +; GFX8_DPP-NEXT: .LBB5_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: v_add_u32_e32 v7, vcc, s5, v7 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v8, vcc, v0, v8, vcc +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: add_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s7 +; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s10, -1 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: s_mov_b32 s8, s2 +; GFX9_DPP-NEXT: s_mov_b32 s9, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s6 +; GFX9_DPP-NEXT: buffer_atomic_add_x2 v[7:8], off, s[8:11], 0 glc +; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9_DPP-NEXT: buffer_wbinvl1_vol +; GFX9_DPP-NEXT: .LBB5_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v7, vcc, s5, v7 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v8, vcc, v0, v8, vcc +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: add_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s11, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1064_DPP-NEXT: buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc +; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl1_inv +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB5_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v10 +; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s2, v11 +; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s3, v12, vcc +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: add_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s8, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1032_DPP-NEXT: buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc +; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl1_inv +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB5_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10 +; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s2, v11 +; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: add_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s4 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1164_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1164_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], 0 glc +; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl1_inv +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB5_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32 v8, vcc, s2, v10 +; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s3, v11, vcc +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: add_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s8, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1132_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], 0 glc +; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl1_inv +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB5_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10 +; GFX1132_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +; +; GFX1264_DPP-LABEL: add_i64_varying: +; GFX1264_DPP: ; %bb.0: ; %entry +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1264_DPP-NEXT: s_not_b64 exec, exec +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1264_DPP-NEXT: s_not_b64 exec, exec +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc +; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1264_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1264_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1264_DPP-NEXT: v_readlane_b32 s10, v4, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 63 +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9] +; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1264_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1264_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1264_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1264_DPP-NEXT: ; %bb.1: +; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, s5 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, s4 +; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1264_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1264_DPP-NEXT: global_wb scope:SCOPE_DEV +; GFX1264_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 +; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV +; GFX1264_DPP-NEXT: .LBB5_2: +; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_u32 v8, vcc, s2, v10 +; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s3, v11, vcc +; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null +; GFX1264_DPP-NEXT: s_nop 0 +; GFX1264_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264_DPP-NEXT: s_endpgm +; +; GFX1232_DPP-LABEL: add_i64_varying: +; GFX1232_DPP: ; %bb.0: ; %entry +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v2 +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1232_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1232_DPP-NEXT: v_readlane_b32 s8, v3, 15 +; GFX1232_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1232_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo +; GFX1232_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1232_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1232_DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1232_DPP-NEXT: ; %bb.1: +; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 +; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1232_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1232_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1232_DPP-NEXT: global_wb scope:SCOPE_DEV +; GFX1232_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 +; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV +; GFX1232_DPP-NEXT: .LBB5_2: +; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1232_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10 +; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo +; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null +; GFX1232_DPP-NEXT: s_nop 0 +; GFX1232_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -2150,7 +4003,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: global_wb scope:SCOPE_DEV +; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB6_2: @@ -2186,7 +4040,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: global_wb scope:SCOPE_DEV +; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB6_2: @@ -2375,581 +4230,1134 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: .LBB7_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: sub_i32_uniform: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1164-NEXT: s_load_b32 s2, s[2:3], 0x34 -; GFX1164-NEXT: s_mov_b64 s[8:9], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB7_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s3, s2, s3 -; GFX1164-NEXT: s_mov_b32 s10, -1 -; GFX1164-NEXT: v_mov_b32_e32 v1, s3 -; GFX1164-NEXT: s_mov_b32 s8, s6 -; GFX1164-NEXT: s_mov_b32 s9, s7 -; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB7_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s6, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: sub_i32_uniform: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX1132-NEXT: s_mov_b32 s8, exec_lo -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB7_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s8 -; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s2, s0, s2 -; GFX1132-NEXT: s_mov_b32 s10, -1 -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-NEXT: s_mov_b32 s8, s6 -; GFX1132-NEXT: s_mov_b32 s9, s7 -; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB7_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s6, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm -; -; GFX1264-LABEL: sub_i32_uniform: -; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_clause 0x1 -; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34 -; GFX1264-NEXT: s_mov_b64 s[8:9], exec -; GFX1264-NEXT: s_mov_b64 s[0:1], exec -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1264-NEXT: ; implicit-def: $vgpr1 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 -; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1264-NEXT: s_cbranch_execz .LBB7_2 -; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mul_i32 s3, s2, s3 -; GFX1264-NEXT: s_mov_b32 s10, -1 -; GFX1264-NEXT: v_mov_b32_e32 v1, s3 -; GFX1264-NEXT: s_mov_b32 s8, s6 -; GFX1264-NEXT: s_mov_b32 s9, s7 -; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1264-NEXT: s_wait_loadcnt 0x0 -; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB7_2: -; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX1264-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1264-NEXT: s_mov_b32 s6, -1 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null -; GFX1264-NEXT: s_nop 0 -; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1264-NEXT: s_endpgm -; -; GFX1232-LABEL: sub_i32_uniform: -; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_clause 0x1 -; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1232-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX1232-NEXT: s_mov_b32 s8, exec_lo -; GFX1232-NEXT: s_mov_b32 s1, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1232-NEXT: ; implicit-def: $vgpr1 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1232-NEXT: s_cbranch_execz .LBB7_2 -; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8 -; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mul_i32 s2, s0, s2 -; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: v_mov_b32_e32 v1, s2 -; GFX1232-NEXT: s_mov_b32 s8, s6 -; GFX1232-NEXT: s_mov_b32 s9, s7 -; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_wait_loadcnt 0x0 -; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB7_2: -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX1232-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-NEXT: s_mov_b32 s6, -1 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null -; GFX1232-NEXT: s_nop 0 -; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1232-NEXT: s_endpgm -entry: - %old = atomicrmw sub ptr addrspace(1) %inout, i32 %subitive syncscope("agent") acq_rel - store i32 %old, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { -; GFX7LESS-LABEL: sub_i32_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: s_mov_b32 s10, s6 -; GFX7LESS-NEXT: s_mov_b32 s11, s7 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s8, s2 -; GFX7LESS-NEXT: s_mov_b32 s9, s3 -; GFX7LESS-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: s_mov_b32 s4, s0 -; GFX7LESS-NEXT: s_mov_b32 s5, s1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: sub_i32_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s6, 0 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB8_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s4 -; GFX8-NEXT: v_readlane_b32 s7, v0, s4 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX8-NEXT: v_writelane_b32 v1, s6, m0 -; GFX8-NEXT: s_add_i32 s6, s6, s7 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8-NEXT: s_cbranch_execz .LBB8_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s2 -; GFX8-NEXT: s_mov_b32 s9, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB8_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: sub_i32_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s6, 0 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB8_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s4 -; GFX9-NEXT: v_readlane_b32 s7, v0, s4 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9-NEXT: v_writelane_b32 v1, s6, m0 -; GFX9-NEXT: s_add_i32 s6, s6, s7 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB8_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: sub_i32_varying: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s6, 0 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s7, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s7 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1064-NEXT: v_writelane_b32 v1, s6, s7 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX1064-NEXT: s_add_i32 s6, s6, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064-NEXT: s_cbranch_execz .LBB8_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s8, s2 -; GFX1064-NEXT: s_mov_b32 s9, s3 -; GFX1064-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: buffer_gl1_inv -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB8_4: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: sub_i32_varying: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB8_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s1 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1032-NEXT: v_writelane_b32 v1, s4, s1 -; GFX1032-NEXT: s_andn2_b32 s0, s0, s6 -; GFX1032-NEXT: s_add_i32 s4, s4, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1032-NEXT: s_cbranch_execz .LBB8_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s8, s2 -; GFX1032-NEXT: s_mov_b32 s9, s3 -; GFX1032-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB8_4: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: sub_i32_varying: +; GFX1164-LABEL: sub_i32_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b32 s2, s[2:3], 0x34 +; GFX1164-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s6, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1164-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_add_i32 s6, s6, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1164-NEXT: s_cbranch_execz .LBB8_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB7_2 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[8:9] ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s10, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s8, s2 -; GFX1164-NEXT: s_mov_b32 s9, s3 +; GFX1164-NEXT: s_mul_i32 s3, s2, s3 +; GFX1164-NEXT: s_mov_b32 s10, -1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s3 +; GFX1164-NEXT: s_mov_b32 s8, s6 +; GFX1164-NEXT: s_mov_b32 s9, s7 ; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB8_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB7_2: +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s6, -1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: sub_i32_varying: +; GFX1132-LABEL: sub_i32_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB8_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1132-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1132-NEXT: s_and_not1_b32 s0, s0, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_add_i32 s4, s4, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX1132-NEXT: s_mov_b32 s8, exec_lo +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1132-NEXT: s_cbranch_execz .LBB8_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_mov_b32_e32 v1, s4 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB7_2 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s8 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s10, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s8, s2 -; GFX1132-NEXT: s_mov_b32 s9, s3 +; GFX1132-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132-NEXT: s_mov_b32 s10, -1 +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: s_mov_b32 s8, s6 +; GFX1132-NEXT: s_mov_b32 s9, s7 ; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB8_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1132-NEXT: .LBB7_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s6, -1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; -; GFX1264-LABEL: sub_i32_varying: +; GFX1264-LABEL: sub_i32_uniform: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1264-NEXT: s_clause 0x1 +; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34 +; GFX1264-NEXT: s_mov_b64 s[8:9], exec ; GFX1264-NEXT: s_mov_b64 s[0:1], exec -; GFX1264-NEXT: s_mov_b32 s6, 0 -; GFX1264-NEXT: ; implicit-def: $vgpr0 -; GFX1264-NEXT: .LBB8_1: ; %ComputeLoop -; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1264-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1264-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1264-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1264-NEXT: s_add_co_i32 s6, s6, s8 -; GFX1264-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1264-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 -; GFX1264-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1264-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1264-NEXT: s_cbranch_execz .LBB8_4 -; GFX1264-NEXT: ; %bb.3: -; GFX1264-NEXT: v_mov_b32_e32 v1, s6 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1264-NEXT: s_cbranch_execz .LBB7_2 +; GFX1264-NEXT: ; %bb.1: +; GFX1264-NEXT: s_bcnt1_i32_b64 s3, s[8:9] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mov_b32 s8, s2 -; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: s_mul_i32 s3, s2, s3 +; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: v_mov_b32_e32 v1, s3 +; GFX1264-NEXT: s_mov_b32 s8, s6 +; GFX1264-NEXT: s_mov_b32 s9, s7 +; GFX1264-NEXT: global_wb scope:SCOPE_DEV +; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB8_4: -; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1264-NEXT: .LBB7_2: +; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1264-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1264-NEXT: s_mov_b32 s6, -1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1264-NEXT: s_mov_b32 s2, -1 -; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null ; GFX1264-NEXT: s_nop 0 ; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1264-NEXT: s_endpgm ; -; GFX1232-LABEL: sub_i32_varying: +; GFX1232-LABEL: sub_i32_uniform: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1232-NEXT: s_mov_b32 s0, exec_lo -; GFX1232-NEXT: s_mov_b32 s4, 0 -; GFX1232-NEXT: ; implicit-def: $vgpr0 -; GFX1232-NEXT: .LBB8_1: ; %ComputeLoop -; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1232-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1232-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1232-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1232-NEXT: s_and_not1_b32 s0, s0, s6 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1232-NEXT: s_add_co_i32 s4, s4, s5 -; GFX1232-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1232-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1232-NEXT: s_clause 0x1 +; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX1232-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX1232-NEXT: s_mov_b32 s8, exec_lo +; GFX1232-NEXT: s_mov_b32 s1, exec_lo +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 -; GFX1232-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1232-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1232-NEXT: s_cbranch_execz .LBB8_4 -; GFX1232-NEXT: ; %bb.3: -; GFX1232-NEXT: v_mov_b32_e32 v1, s4 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1232-NEXT: s_cbranch_execz .LBB7_2 +; GFX1232-NEXT: ; %bb.1: +; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-NEXT: s_mov_b32 s10, -1 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mov_b32 s8, s2 -; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: s_mul_i32 s2, s0, s2 +; GFX1232-NEXT: s_mov_b32 s10, -1 +; GFX1232-NEXT: v_mov_b32_e32 v1, s2 +; GFX1232-NEXT: s_mov_b32 s8, s6 +; GFX1232-NEXT: s_mov_b32 s9, s7 +; GFX1232-NEXT: global_wb scope:SCOPE_DEV +; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB8_4: -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1232-NEXT: .LBB7_2: +; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1232-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232-NEXT: s_mov_b32 s6, -1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1232-NEXT: s_mov_b32 s2, -1 -; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null ; GFX1232-NEXT: s_nop 0 ; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1232-NEXT: s_endpgm +entry: + %old = atomicrmw sub ptr addrspace(1) %inout, i32 %subitive syncscope("agent") acq_rel + store i32 %old, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { +; GFX7LESS_ITERATIVE-LABEL: sub_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s4 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB8_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1 +; GFX7LESS_ITERATIVE-NEXT: .LBB8_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt expcnt(0) +; GFX7LESS_ITERATIVE-NEXT: v_sub_i32_e32 v0, vcc, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: sub_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s4 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX8_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol +; GFX8_ITERATIVE-NEXT: .LBB8_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: sub_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s4 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX9_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol +; GFX9_ITERATIVE-NEXT: .LBB8_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_sub_u32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: sub_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1064_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB8_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: sub_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s1 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s1 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s4, s4, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1032_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB8_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: sub_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1164_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB8_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: sub_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_add_i32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1132_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB8_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX1264_ITERATIVE-LABEL: sub_i32_varying: +; GFX1264_ITERATIVE: ; %bb.0: ; %entry +; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s6, 0 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 +; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX1264_ITERATIVE-NEXT: ; %bb.3: +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1264_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 +; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: .LBB8_4: +; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1264_ITERATIVE-NEXT: s_nop 0 +; GFX1264_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264_ITERATIVE-NEXT: s_endpgm +; +; GFX1232_ITERATIVE-LABEL: sub_i32_varying: +; GFX1232_ITERATIVE: ; %bb.0: ; %entry +; GFX1232_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s4, s4, s5 +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 +; GFX1232_ITERATIVE-NEXT: ; %bb.3: +; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1232_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 +; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: .LBB8_4: +; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1232_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1232_ITERATIVE-NEXT: s_nop 0 +; GFX1232_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: sub_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 +; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6 +; GFX7LESS_DPP-NEXT: s_mov_b32 s11, s7 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s8, s2 +; GFX7LESS_DPP-NEXT: s_mov_b32 s9, s3 +; GFX7LESS_DPP-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX7LESS_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS_DPP-NEXT: buffer_wbinvl1 +; GFX7LESS_DPP-NEXT: s_mov_b32 s4, s0 +; GFX7LESS_DPP-NEXT: s_mov_b32 s5, s1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: sub_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s6, v2, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s10, -1 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: s_mov_b32 s8, s2 +; GFX8_DPP-NEXT: s_mov_b32 s9, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX8_DPP-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX8_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX8_DPP-NEXT: buffer_wbinvl1_vol +; GFX8_DPP-NEXT: .LBB8_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: sub_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s10, -1 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: s_mov_b32 s8, s2 +; GFX9_DPP-NEXT: s_mov_b32 s9, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX9_DPP-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9_DPP-NEXT: buffer_wbinvl1_vol +; GFX9_DPP-NEXT: .LBB8_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: sub_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b32 s4, s9 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1064_DPP-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl1_inv +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB8_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: sub_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s4, s6 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1032_DPP-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl1_inv +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB8_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: sub_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b32 s4, s9 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1164_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1164_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl1_inv +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB8_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: sub_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1132_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl1_inv +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB8_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +; +; GFX1264_DPP-LABEL: sub_i32_varying: +; GFX1264_DPP: ; %bb.0: ; %entry +; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1264_DPP-NEXT: s_not_b64 exec, exec +; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1264_DPP-NEXT: s_not_b64 exec, exec +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1264_DPP-NEXT: s_mov_b32 s4, s9 +; GFX1264_DPP-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1264_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1264_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1264_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1264_DPP-NEXT: ; %bb.1: +; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1264_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1264_DPP-NEXT: global_wb scope:SCOPE_DEV +; GFX1264_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 +; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV +; GFX1264_DPP-NEXT: .LBB8_2: +; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1264_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1264_DPP-NEXT: s_nop 0 +; GFX1264_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264_DPP-NEXT: s_endpgm +; +; GFX1232_DPP-LABEL: sub_i32_varying: +; GFX1232_DPP: ; %bb.0: ; %entry +; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 +; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1232_DPP-NEXT: s_cbranch_execz .LBB8_2 +; GFX1232_DPP-NEXT: ; %bb.1: +; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1232_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1232_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1232_DPP-NEXT: global_wb scope:SCOPE_DEV +; GFX1232_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 +; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV +; GFX1232_DPP-NEXT: .LBB8_2: +; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1232_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1232_DPP-NEXT: s_nop 0 +; GFX1232_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = atomicrmw sub ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel @@ -3254,7 +5662,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: global_wb scope:SCOPE_DEV +; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB9_2: @@ -3294,7 +5703,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: global_wb scope:SCOPE_DEV +; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB9_2: @@ -3659,7 +6069,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: s_mov_b32 s8, s6 ; GFX1264-NEXT: s_mov_b32 s9, s7 -; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: global_wb scope:SCOPE_DEV +; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB10_2: @@ -3703,7 +6114,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX1232-NEXT: s_mov_b32 s12, s6 ; GFX1232-NEXT: s_mov_b32 s13, s7 -; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN +; GFX1232-NEXT: global_wb scope:SCOPE_DEV +; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB10_2: @@ -3731,151 +6143,1367 @@ entry: } define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { -; GFX7LESS-LABEL: sub_i64_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 s10, s6 -; GFX7LESS-NEXT: s_mov_b32 s11, s7 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s8, s2 -; GFX7LESS-NEXT: s_mov_b32 s9, s3 -; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: s_mov_b32 s4, s0 -; GFX7LESS-NEXT: s_mov_b32 s5, s1 -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX7LESS_ITERATIVE-LABEL: sub_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB11_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1 +; GFX7LESS_ITERATIVE-NEXT: .LBB11_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt expcnt(0) +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX7LESS_ITERATIVE-NEXT: v_sub_i32_e32 v0, vcc, s5, v1 +; GFX7LESS_ITERATIVE-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; -; GFX89-LABEL: sub_i64_varying: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: s_mov_b32 s10, s6 -; GFX89-NEXT: s_mov_b32 s11, s7 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: v_mov_b32_e32 v1, 0 -; GFX89-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_wbinvl1_vol -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX89-NEXT: s_endpgm -; -; GFX10-LABEL: sub_i64_varying: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s11, s7 -; GFX10-NEXT: s_mov_b32 s10, s6 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s2 -; GFX10-NEXT: s_mov_b32 s9, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 -; GFX10-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX10-NEXT: s_endpgm -; -; GFX1164-LABEL: sub_i64_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s6, -1 -; GFX1164-NEXT: s_mov_b32 s11, s7 -; GFX1164-NEXT: s_mov_b32 s10, s6 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s8, s2 -; GFX1164-NEXT: s_mov_b32 s9, s3 -; GFX1164-NEXT: s_mov_b32 s4, s0 -; GFX1164-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: s_mov_b32 s5, s1 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX8_ITERATIVE-LABEL: sub_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 +; GFX8_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX8_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc +; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol +; GFX8_ITERATIVE-NEXT: .LBB11_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_ITERATIVE-NEXT: v_sub_u32_e32 v0, vcc, s5, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX1132-LABEL: sub_i64_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s6, -1 -; GFX1132-NEXT: s_mov_b32 s11, s7 -; GFX1132-NEXT: s_mov_b32 s10, s6 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s8, s2 -; GFX1132-NEXT: s_mov_b32 s9, s3 -; GFX1132-NEXT: s_mov_b32 s4, s0 -; GFX1132-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: s_mov_b32 s5, s1 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: sub_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 +; GFX9_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX9_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc +; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol +; GFX9_ITERATIVE-NEXT: .LBB11_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_ITERATIVE-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v2, vcc +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX1264-LABEL: sub_i64_varying: -; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1264-NEXT: v_mov_b32_e32 v1, 0 -; GFX1264-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1264-NEXT: s_mov_b32 s6, -1 -; GFX1264-NEXT: s_mov_b32 s11, s7 -; GFX1264-NEXT: s_mov_b32 s10, s6 -; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mov_b32 s8, s2 -; GFX1264-NEXT: s_mov_b32 s9, s3 -; GFX1264-NEXT: s_mov_b32 s4, s0 -; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1264-NEXT: s_wait_loadcnt 0x0 -; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: s_mov_b32 s5, s1 -; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX1264-NEXT: s_nop 0 -; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1264-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: sub_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s6 +; GFX1064_ITERATIVE-NEXT: s_add_u32 s4, s4, s7 +; GFX1064_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1064_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc +; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB11_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v1 +; GFX1064_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX1232-LABEL: sub_i64_varying: -; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-NEXT: s_mov_b32 s6, -1 -; GFX1232-NEXT: s_mov_b32 s11, s7 -; GFX1232-NEXT: s_mov_b32 s10, s6 -; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mov_b32 s8, s2 -; GFX1232-NEXT: s_mov_b32 s9, s3 -; GFX1232-NEXT: s_mov_b32 s4, s0 -; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX1232-NEXT: s_wait_loadcnt 0x0 -; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: s_mov_b32 s5, s1 -; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null -; GFX1232-NEXT: s_nop 0 -; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1232-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: sub_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s1 +; GFX1032_ITERATIVE-NEXT: s_add_u32 s4, s4, s6 +; GFX1032_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1032_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc +; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB11_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 +; GFX1032_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: sub_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s6 +; GFX1164_ITERATIVE-NEXT: s_add_u32 s4, s4, s7 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1164_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], 0 glc +; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB11_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v0 +; GFX1164_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: sub_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_ITERATIVE-NEXT: s_add_u32 s4, s4, s6 +; GFX1132_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1132_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], 0 glc +; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB11_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 +; GFX1132_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX1264_ITERATIVE-LABEL: sub_i64_varying: +; GFX1264_ITERATIVE: ; %bb.0: ; %entry +; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s10 +; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] +; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1264_ITERATIVE-NEXT: ; %bb.3: +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1264_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 +; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: .LBB11_4: +; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v0 +; GFX1264_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1264_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX1264_ITERATIVE-NEXT: s_nop 0 +; GFX1264_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264_ITERATIVE-NEXT: s_endpgm +; +; GFX1232_ITERATIVE-LABEL: sub_i64_varying: +; GFX1232_ITERATIVE: ; %bb.0: ; %entry +; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s1 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 +; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1232_ITERATIVE-NEXT: ; %bb.3: +; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 +; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX1232_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 +; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: .LBB11_4: +; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 +; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 +; GFX1232_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1232_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX1232_ITERATIVE-NEXT: s_nop 0 +; GFX1232_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: sub_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6 +; GFX7LESS_DPP-NEXT: s_mov_b32 s11, s7 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s8, s2 +; GFX7LESS_DPP-NEXT: s_mov_b32 s9, s3 +; GFX7LESS_DPP-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc +; GFX7LESS_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS_DPP-NEXT: buffer_wbinvl1 +; GFX7LESS_DPP-NEXT: s_mov_b32 s4, s0 +; GFX7LESS_DPP-NEXT: s_mov_b32 s5, s1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: sub_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s7 +; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s10, -1 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: s_mov_b32 s8, s2 +; GFX8_DPP-NEXT: s_mov_b32 s9, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s6 +; GFX8_DPP-NEXT: buffer_atomic_sub_x2 v[7:8], off, s[8:11], 0 glc +; GFX8_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX8_DPP-NEXT: buffer_wbinvl1_vol +; GFX8_DPP-NEXT: .LBB11_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: v_sub_u32_e32 v7, vcc, s5, v7 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_subb_u32_e32 v8, vcc, v0, v8, vcc +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: sub_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s7 +; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s10, -1 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: s_mov_b32 s8, s2 +; GFX9_DPP-NEXT: s_mov_b32 s9, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s6 +; GFX9_DPP-NEXT: buffer_atomic_sub_x2 v[7:8], off, s[8:11], 0 glc +; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9_DPP-NEXT: buffer_wbinvl1_vol +; GFX9_DPP-NEXT: .LBB11_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_sub_co_u32_e32 v7, vcc, s5, v7 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_subb_co_u32_e32 v8, vcc, v0, v8, vcc +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: sub_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s11, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1064_DPP-NEXT: buffer_atomic_sub_x2 v[9:10], off, s[4:7], 0 glc +; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl1_inv +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB11_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v10 +; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s2, v11 +; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s3, v12, vcc +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: sub_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s8, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1032_DPP-NEXT: buffer_atomic_sub_x2 v[9:10], off, s[4:7], 0 glc +; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl1_inv +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB11_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10 +; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s2, v11 +; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: sub_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s4 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1164_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1164_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], 0 glc +; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl1_inv +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB11_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, s2, v10 +; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s3, v11, vcc +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: sub_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s8, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1132_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], 0 glc +; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl1_inv +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB11_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10 +; GFX1132_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +; +; GFX1264_DPP-LABEL: sub_i64_varying: +; GFX1264_DPP: ; %bb.0: ; %entry +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1264_DPP-NEXT: s_not_b64 exec, exec +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1264_DPP-NEXT: s_not_b64 exec, exec +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc +; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1264_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1264_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s7, 16 +; GFX1264_DPP-NEXT: v_readlane_b32 s10, v4, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 63 +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 48 +; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 48 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9] +; GFX1264_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1264_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1264_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1264_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1264_DPP-NEXT: ; %bb.1: +; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, s5 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, s4 +; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1264_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1264_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1264_DPP-NEXT: global_wb scope:SCOPE_DEV +; GFX1264_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0 +; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV +; GFX1264_DPP-NEXT: .LBB11_2: +; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_sub_co_u32 v8, vcc, s2, v10 +; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1264_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s3, v11, vcc +; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null +; GFX1264_DPP-NEXT: s_nop 0 +; GFX1264_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1264_DPP-NEXT: s_endpgm +; +; GFX1232_DPP-LABEL: sub_i64_varying: +; GFX1232_DPP: ; %bb.0: ; %entry +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v2 +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo +; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1232_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1232_DPP-NEXT: v_readlane_b32 s8, v3, 15 +; GFX1232_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1232_DPP-NEXT: v_writelane_b32 v2, s8, 16 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo +; GFX1232_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1232_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1232_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1232_DPP-NEXT: ; %bb.1: +; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 +; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1232_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1232_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1232_DPP-NEXT: global_wb scope:SCOPE_DEV +; GFX1232_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0 +; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV +; GFX1232_DPP-NEXT: .LBB11_2: +; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0 +; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1232_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10 +; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1232_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo +; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null +; GFX1232_DPP-NEXT: s_nop 0 +; GFX1232_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1232_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -3883,6 +7511,3 @@ entry: store i64 %old, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX11: {{.*}} -; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 3784af443c7f1f..1439d4b40c951c 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1,11 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s +; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s +; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -15,8 +22,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() ; Show what the atomic optimization pass will do for local pointers. define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { -; -; ; GFX7LESS-LABEL: add_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec @@ -229,8 +234,6 @@ entry: } define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) { -; -; ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec @@ -460,277 +463,633 @@ entry: } define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: add_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB2_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB2_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: v_add_i32_e32 v0, vcc, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: add_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB2_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_add_u32_e32 v0, vcc, s4, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: add_i32_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: add_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB2_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_add_u32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: add_i32_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB2_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB2_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB2_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: add_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB2_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: add_i32_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB2_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB2_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB2_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: add_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB2_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: add_i32_varying: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s4, 0 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064-NEXT: s_add_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB2_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v2 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB2_4: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: add_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: ds_add_rtn_u32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB2_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: add_i32_varying: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_add_i32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB2_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 -; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v2 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB2_4: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: add_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_add_rtn_u32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB2_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: add_i32_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_add_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB2_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB2_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX7LESS_DPP-LABEL: add_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_add_rtn_u32 v0, v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm ; -; GFX1132-LABEL: add_i32_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB2_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_add_i32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB2_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB2_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX8_DPP-LABEL: add_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_add_rtn_u32 v0, v3, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB2_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: add_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: ds_add_rtn_u32 v0, v3, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB2_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: add_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB2_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: add_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB2_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: add_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB2_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: add_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB2_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = atomicrmw add ptr addrspace(3) @local_var32, i32 %lane acq_rel @@ -739,191 +1098,428 @@ entry: } define amdgpu_kernel void @add_i32_varying_nouse() { -; GFX7LESS-LABEL: add_i32_varying_nouse: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: ds_add_u32 v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_endpgm +; GFX7LESS_ITERATIVE-LABEL: add_i32_varying_nouse: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX7LESS_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB3_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_add_u32 v0, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB3_4: +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: add_i32_varying_nouse: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s2, 0 -; GFX8-NEXT: .LBB3_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX8-NEXT: s_add_i32 s2, s2, s6 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB3_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_add_u32 v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB3_4: -; GFX8-NEXT: s_endpgm +; GFX8_ITERATIVE-LABEL: add_i32_varying_nouse: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX8_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_add_u32 v0, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB3_4: +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: add_i32_varying_nouse: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s2, 0 -; GFX9-NEXT: .LBB3_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX9-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX9-NEXT: s_add_i32 s2, s2, s6 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB3_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: ds_add_u32 v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB3_4: -; GFX9-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: add_i32_varying_nouse: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX9_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX9_ITERATIVE-NEXT: ds_add_u32 v0, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB3_4: +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: add_i32_varying_nouse: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s2, 0 -; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX1064-NEXT: s_add_i32 s2, s2, s6 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB3_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: ds_add_u32 v0, v1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB3_4: -; GFX1064-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: add_i32_varying_nouse: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX1064_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064_ITERATIVE-NEXT: ds_add_u32 v0, v1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB3_4: +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: add_i32_varying_nouse: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s2, s1 -; GFX1032-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s2 -; GFX1032-NEXT: s_add_i32 s0, s0, s3 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB3_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v1, s0 -; GFX1032-NEXT: ds_add_u32 v0, v1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB3_4: -; GFX1032-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: add_i32_varying_nouse: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032_ITERATIVE-NEXT: ds_add_u32 v0, v1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB3_4: +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: add_i32_varying_nouse: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s2, 0 -; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1164-NEXT: s_add_i32 s2, s2, s6 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB3_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: ds_add_u32 v0, v1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB3_4: -; GFX1164-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: add_i32_varying_nouse: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX1164_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164_ITERATIVE-NEXT: ds_add_u32 v0, v1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB3_4: +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1132-LABEL: add_i32_varying_nouse: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1132-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2 -; GFX1132-NEXT: s_add_i32 s0, s0, s3 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB3_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX1132-NEXT: ds_add_u32 v0, v1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB3_4: -; GFX1132-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: add_i32_varying_nouse: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1132_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB3_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX1132_ITERATIVE-NEXT: ds_add_u32 v0, v1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB3_4: +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: add_i32_varying_nouse: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: ds_add_u32 v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: add_i32_varying_nouse: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b32 s0, s2 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_add_u32 v2, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB3_2: +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: add_i32_varying_nouse: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b32 s0, s2 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX9_DPP-NEXT: ds_add_u32 v2, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB3_2: +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: add_i32_varying_nouse: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 0 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: s_add_i32 s0, s2, s3 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_DPP-NEXT: ds_add_u32 v0, v3 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB3_2: +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: add_i32_varying_nouse: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: ds_add_u32 v0, v3 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB3_2: +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: add_i32_varying_nouse: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: ds_add_u32 v0, v3 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB3_2: +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: add_i32_varying_nouse: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 +; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB3_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: ds_add_u32 v0, v3 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB3_2: +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = atomicrmw add ptr addrspace(3) @local_var32, i32 %lane acq_rel @@ -931,8 +1527,6 @@ entry: } define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { -; -; ; GFX7LESS-LABEL: add_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec @@ -1163,8 +1757,6 @@ entry: } define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) { -; -; ; GFX7LESS-LABEL: add_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec @@ -1441,120 +2033,1653 @@ entry: } define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: add_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 +; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB6_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB6_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX7LESS_ITERATIVE-NEXT: v_add_i32_e32 v0, vcc, s5, v1 +; GFX7LESS_ITERATIVE-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: add_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX8_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB6_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_ITERATIVE-NEXT: v_add_u32_e32 v0, vcc, s5, v1 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: add_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB6_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_ITERATIVE-NEXT: v_add_co_u32_e32 v0, vcc, s5, v1 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v2, vcc +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: add_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 +; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 +; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB6_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1 +; GFX1064_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: add_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB6_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1 +; GFX1032_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: add_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 +; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1164_ITERATIVE-NEXT: ds_add_rtn_u64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB6_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v0 +; GFX1164_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: add_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_add_rtn_u64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB6_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 +; GFX1132_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: add_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: add_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_add_rtn_u64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB6_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v10 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v9 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: v_add_u32_e32 v7, vcc, s5, v7 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v8, vcc, v0, v8, vcc +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: add_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX9_DPP-NEXT: ds_add_rtn_u64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB6_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v10 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v9 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v7, vcc, s5, v7 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v8, vcc, v0, v8, vcc +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: add_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v4, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: ds_add_rtn_u64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB6_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s0, v9 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s1, v10, vcc +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: add_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1032_DPP-NEXT: ds_add_rtn_u64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB6_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s0, v9 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v10, vcc_lo +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: add_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s4, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s0 +; GFX1164_DPP-NEXT: ds_add_rtn_u64 v[10:11], v9, v[10:11] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB6_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v2 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32 v8, vcc, s0, v8 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s1, v9, vcc +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: add_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX1132_DPP-NEXT: ds_add_rtn_u64 v[10:11], v9, v[10:11] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB6_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v2 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s0, v8 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s1, v9, vcc_lo +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %zext = zext i32 %lane to i64 + %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %zext acq_rel + store i64 %old, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @add_i64_varying_nouse() { +; GFX7LESS_ITERATIVE-LABEL: add_i64_varying_nouse: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_nop 0 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s4 +; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 +; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB7_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB7_4: +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: add_i64_varying_nouse: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s4 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX8_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 +; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB7_4: +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: add_i64_varying_nouse: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s4 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX9_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 +; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 +; GFX9_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB7_4: +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: add_i64_varying_nouse: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s4 +; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s5 +; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 +; GFX1064_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB7_4: +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: add_i64_varying_nouse: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s3 +; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB7_4: +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: add_i64_varying_nouse: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s4, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s4 +; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s5 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 +; GFX1164_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB7_4: +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: add_i64_varying_nouse: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1132_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX1132_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB7_4: +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: add_i64_varying_nouse: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: ds_add_u64 v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_endpgm ; +; GFX8_DPP-LABEL: add_i64_varying_nouse: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s3, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_add_u64 v8, v[9:10] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB7_2: +; GFX8_DPP-NEXT: s_endpgm ; -; GFX7LESS-LABEL: add_i64_varying: +; GFX9_DPP-LABEL: add_i64_varying_nouse: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v1, vcc, v3, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s3, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX9_DPP-NEXT: ds_add_u64 v8, v[9:10] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB7_2: +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: add_i64_varying_nouse: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v3, v1 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v4, v2, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v4, vcc +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 0 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 0 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_add_u32 s0, s3, s4 +; GFX1064_DPP-NEXT: s_addc_u32 s1, s2, s5 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: ds_add_u64 v10, v[11:12] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB7_2: +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: add_i64_varying_nouse: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v3, v1 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: ds_add_u64 v10, v[11:12] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB7_2: +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: add_i64_varying_nouse: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v6, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v6 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v7 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v4, v1, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v3, v2 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlane64_b32 v3, v2 +; GFX1164_DPP-NEXT: v_permlane64_b32 v4, v1 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v1, v4, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: ds_add_u64 v7, v[8:9] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB7_2: +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: add_i64_varying_nouse: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_and_b32 v6, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v6 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v7 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132_DPP-NEXT: v_add_co_u32 v2, vcc_lo, v3, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v4, vcc_lo +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB7_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: ds_add_u64 v7, v[8:9] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB7_2: +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %zext = zext i32 %lane to i64 + %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %zext acq_rel + ret void +} + +define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { +; GFX7LESS-LABEL: sub_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 +; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: add_i64_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: add_i64_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: add_i64_varying: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX10-NEXT: s_endpgm -; -; GFX1164-LABEL: add_i64_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: add_i64_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm -entry: - %lane = call i32 @llvm.amdgcn.workitem.id.x() - %zext = zext i32 %lane to i64 - %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %zext acq_rel - store i64 %old, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { -; -; -; GFX7LESS-LABEL: sub_i32_constant: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB7_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: .LBB8_2: +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -1572,7 +3697,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB7_2 +; GFX8-NEXT: s_cbranch_execz .LBB8_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 @@ -1581,7 +3706,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB7_2: +; GFX8-NEXT: .LBB8_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 @@ -1601,7 +3726,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -1609,7 +3734,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB7_2: +; GFX9-NEXT: .LBB8_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 @@ -1629,7 +3754,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB7_2 +; GFX1064-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -1638,7 +3763,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB7_2: +; GFX1064-NEXT: .LBB8_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 @@ -1659,7 +3784,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB7_2 +; GFX1032-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -1668,7 +3793,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB7_2: +; GFX1032-NEXT: .LBB8_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 @@ -1691,7 +3816,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB7_2 +; GFX1164-NEXT: s_cbranch_execz .LBB8_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -1701,7 +3826,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB7_2: +; GFX1164-NEXT: .LBB8_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 @@ -1724,7 +3849,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB7_2 +; GFX1132-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -1733,7 +3858,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB7_2: +; GFX1132-NEXT: .LBB8_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 @@ -1754,8 +3879,6 @@ entry: } define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) { -; -; ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec @@ -1765,7 +3888,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 +; GFX7LESS-NEXT: s_cbranch_execz .LBB9_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1775,7 +3898,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB8_2: +; GFX7LESS-NEXT: .LBB9_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -1796,7 +3919,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB8_2 +; GFX8-NEXT: s_cbranch_execz .LBB9_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1806,7 +3929,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB8_2: +; GFX8-NEXT: .LBB9_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1827,7 +3950,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1836,7 +3959,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB8_2: +; GFX9-NEXT: .LBB9_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1857,7 +3980,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB8_2 +; GFX1064-NEXT: s_cbranch_execz .LBB9_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -1867,7 +3990,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB8_2: +; GFX1064-NEXT: .LBB9_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 @@ -1888,7 +4011,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB8_2 +; GFX1032-NEXT: s_cbranch_execz .LBB9_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -1898,7 +4021,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB8_2: +; GFX1032-NEXT: .LBB9_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 @@ -1921,7 +4044,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB8_2 +; GFX1164-NEXT: s_cbranch_execz .LBB9_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -1932,7 +4055,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB8_2: +; GFX1164-NEXT: .LBB9_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -1956,7 +4079,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB8_2 +; GFX1132-NEXT: s_cbranch_execz .LBB9_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -1966,7 +4089,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB8_2: +; GFX1132-NEXT: .LBB9_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -1987,1097 +4110,2558 @@ entry: } define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: sub_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB10_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB10_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: v_sub_i32_e32 v0, vcc, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: sub_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB10_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: sub_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB10_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_sub_u32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: sub_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB10_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: sub_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB10_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: sub_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: ds_sub_rtn_u32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB10_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: sub_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_sub_rtn_u32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB10_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: sub_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_sub_rtn_u32 v0, v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: sub_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB10_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_sub_rtn_u32 v0, v3, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB10_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: sub_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB10_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: ds_sub_rtn_u32 v0, v3, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB10_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: sub_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB10_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB10_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: sub_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB10_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB10_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: sub_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB10_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB10_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: sub_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB10_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB10_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %lane acq_rel + store i32 %old, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @sub_i32_varying_nouse() { +; GFX7LESS_ITERATIVE-LABEL: sub_i32_varying_nouse: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX7LESS_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB11_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_sub_u32 v0, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB11_4: +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: sub_i32_varying_nouse: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_sub_u32 v0, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB11_4: +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: sub_i32_varying_nouse: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX9_ITERATIVE-NEXT: ds_sub_u32 v0, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB11_4: +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: sub_i32_varying_nouse: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX1064_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064_ITERATIVE-NEXT: ds_sub_u32 v0, v1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB11_4: +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: sub_i32_varying_nouse: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032_ITERATIVE-NEXT: ds_sub_u32 v0, v1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB11_4: +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: sub_i32_varying_nouse: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164_ITERATIVE-NEXT: ds_sub_u32 v0, v1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB11_4: +; GFX1164_ITERATIVE-NEXT: s_endpgm ; +; GFX1132_ITERATIVE-LABEL: sub_i32_varying_nouse: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1132_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX1132_ITERATIVE-NEXT: ds_sub_u32 v0, v1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB11_4: +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: sub_i32_varying: +; GFX7LESS_DPP-LABEL: sub_i32_varying_nouse: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: ds_sub_u32 v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: sub_i32_varying_nouse: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b32 s0, s2 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_sub_u32 v2, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB11_2: +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: sub_i32_varying_nouse: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b32 s0, s2 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX9_DPP-NEXT: ds_sub_u32 v2, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB11_2: +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: sub_i32_varying_nouse: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 0 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: s_add_i32 s0, s2, s3 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_DPP-NEXT: ds_sub_u32 v0, v3 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB11_2: +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: sub_i32_varying_nouse: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: ds_sub_u32 v0, v3 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB11_2: +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: sub_i32_varying_nouse: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: ds_sub_u32 v0, v3 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB11_2: +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: sub_i32_varying_nouse: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 +; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB11_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: ds_sub_u32 v0, v3 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB11_2: +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %lane acq_rel + ret void +} + +define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { +; GFX7LESS-LABEL: sub_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 +; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: .LBB12_2: +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0 +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 +; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; -; GFX8-LABEL: sub_i32_varying: +; GFX8-LABEL: sub_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB9_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB9_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_cbranch_execz .LBB12_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_mul_i32 s4, s4, 5 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB9_4: +; GFX8-NEXT: .LBB12_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: v_readfirstlane_b32 s4, v1 +; GFX8-NEXT: v_readfirstlane_b32 s5, v0 +; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; -; GFX9-LABEL: sub_i32_varying: +; GFX9-LABEL: sub_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB9_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB9_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_mul_i32 s4, s4, 5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: .LBB12_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: sub_i32_varying: +; GFX1064-LABEL: sub_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s4, 0 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064-NEXT: s_add_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB9_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX1064-NEXT: s_cbranch_execz .LBB12_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_mul_i32 s4, s4, 5 +; GFX1064-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB9_4: +; GFX1064-NEXT: .LBB12_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: sub_i32_varying: +; GFX1032-LABEL: sub_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_add_i32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB9_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 -; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v2 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB12_2 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_mul_i32 s1, s1, 5 +; GFX1032-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB9_4: +; GFX1032-NEXT: .LBB12_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: sub_i32_varying: +; GFX1164-LABEL: sub_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_add_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB9_4 -; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB12_2 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 +; GFX1164-NEXT: s_mul_i32 s4, s4, 5 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB9_4: +; GFX1164-NEXT: .LBB12_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0 +; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: sub_i32_varying: +; GFX1132-LABEL: sub_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_add_i32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB9_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB12_2 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1132-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-NEXT: s_mul_i32 s1, s1, 5 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_mov_b32_e32 v0, s1 +; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB9_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: .LBB12_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 +; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: - %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %lane acq_rel - store i32 %old, ptr addrspace(1) %out + %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 5 acq_rel + store i64 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @sub_i32_varying_nouse() { -; GFX7LESS-LABEL: sub_i32_varying_nouse: +define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) { +; GFX7LESS-LABEL: sub_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB13_2 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: ds_sub_u32 v1, v0 +; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: .LBB13_2: +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s4, s0 +; GFX7LESS-NEXT: s_mov_b32 s5, s1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v0 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 +; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 +; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v2 +; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; -; GFX8-LABEL: sub_i32_varying_nouse: +; GFX8-LABEL: sub_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s2, 0 -; GFX8-NEXT: .LBB10_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX8-NEXT: s_add_i32 s2, s2, s6 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB10_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_cbranch_execz .LBB13_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 +; GFX8-NEXT: s_mul_i32 s6, s3, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_sub_u32 v0, v1 +; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: .LBB13_2: +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB10_4: +; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s1, v2 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; -; GFX9-LABEL: sub_i32_varying_nouse: +; GFX9-LABEL: sub_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s2, 0 -; GFX9-NEXT: .LBB10_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX9-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX9-NEXT: s_add_i32 s2, s2, s6 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB10_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: ds_sub_u32 v0, v1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB10_4: +; GFX9-NEXT: s_mul_i32 s7, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 +; GFX9-NEXT: s_add_i32 s8, s8, s7 +; GFX9-NEXT: s_mul_i32 s6, s2, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: .LBB13_2: +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s1, v3 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: sub_i32_varying_nouse: +; GFX1064-LABEL: sub_i64_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s2, 0 -; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX1064-NEXT: s_add_i32 s2, s2, s6 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB10_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: ds_sub_u32 v0, v1 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB13_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_mul_i32 s7, s3, s6 +; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 +; GFX1064-NEXT: s_mul_i32 s6, s2, s6 +; GFX1064-NEXT: s_add_i32 s8, s8, s7 +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: v_mov_b32_e32 v1, s8 +; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB10_4: +; GFX1064-NEXT: .LBB13_2: +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 +; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 +; GFX1064-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: sub_i32_varying_nouse: +; GFX1032-LABEL: sub_i64_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB10_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s2, s1 -; GFX1032-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1032-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s2 -; GFX1032-NEXT: s_add_i32 s0, s0, s3 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB10_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v1, s0 -; GFX1032-NEXT: ds_sub_u32 v0, v1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB10_4: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: sub_i32_varying_nouse: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s2, 0 -; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s6, v0, s3 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1164-NEXT: s_add_i32 s2, s2, s6 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB10_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: ds_sub_u32 v0, v1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB10_4: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: sub_i32_varying_nouse: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1132-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2 -; GFX1132-NEXT: s_add_i32 s0, s0, s3 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB10_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX1132-NEXT: ds_sub_u32 v0, v1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB10_4: -; GFX1132-NEXT: s_endpgm -entry: - %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %lane acq_rel - ret void -} - -define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { -; -; -; GFX7LESS-LABEL: sub_i64_constant: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB11_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v0 -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 -; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: sub_i64_constant: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB11_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB11_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v1 -; GFX8-NEXT: v_readfirstlane_b32 s5, v0 -; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v0 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: sub_i64_constant: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB11_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB11_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: sub_i64_constant: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB11_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s4, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB11_2: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 -; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: sub_i64_constant: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB11_2 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB13_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s1, s1, 5 -; GFX1032-NEXT: v_mov_b32_e32 v0, s1 -; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_mul_i32 s6, s3, s5 +; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 +; GFX1032-NEXT: s_mul_i32 s5, s2, s5 +; GFX1032-NEXT: s_add_i32 s7, s7, s6 +; GFX1032-NEXT: v_mov_b32_e32 v0, s5 +; GFX1032-NEXT: v_mov_b32_e32 v1, s7 +; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB11_2: +; GFX1032-NEXT: .LBB13_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0 +; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5] ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 -; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 +; GFX1032-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: sub_i64_constant: +; GFX1164-LABEL: sub_i64_uniform: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB11_2 +; GFX1164-NEXT: s_cbranch_execz .LBB13_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_mul_i32 s4, s4, 5 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mov_b32_e32 v0, s4 -; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_mul_i32 s7, s3, s6 +; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 +; GFX1164-NEXT: s_mul_i32 s6, s2, s6 +; GFX1164-NEXT: s_add_i32 s8, s8, s7 +; GFX1164-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164-NEXT: v_mov_b32_e32 v1, s8 +; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB11_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: .LBB13_2: +; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0 -; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1164-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1164-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] +; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mov_b32_e32 v1, v5 +; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: sub_i64_constant: +; GFX1132-LABEL: sub_i64_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB11_2 +; GFX1132-NEXT: s_cbranch_execz .LBB13_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_mul_i32 s1, s1, 5 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v0, s1 -; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_mul_i32 s6, s3, s5 +; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 +; GFX1132-NEXT: s_mul_i32 s5, s2, s5 +; GFX1132-NEXT: s_add_i32 s7, s7, s6 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7 +; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB11_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: .LBB13_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 -; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] +; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_mov_b32_e32 v1, v5 +; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: - %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 5 acq_rel + %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 %subitive acq_rel store i64 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) { +define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: sub_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 +; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB14_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB14_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX7LESS_ITERATIVE-NEXT: v_sub_i32_e32 v0, vcc, s5, v1 +; GFX7LESS_ITERATIVE-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: sub_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX8_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB14_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_ITERATIVE-NEXT: v_sub_u32_e32 v0, vcc, s5, v1 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: sub_i64_uniform: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 -; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB12_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s4, s0 -; GFX7LESS-NEXT: s_mov_b32 s5, s1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v0 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 -; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 -; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s0 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v2 -; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: sub_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB14_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_ITERATIVE-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v1 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v2, vcc +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: sub_i64_uniform: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB12_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 -; GFX8-NEXT: s_mul_i32 s6, s3, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB12_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: v_readfirstlane_b32 s1, v0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s1, v2 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: sub_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 +; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 +; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB14_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v1 +; GFX1064_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: sub_i64_uniform: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB12_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s7, s3, s6 -; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 -; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB12_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s1, v3 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: sub_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB14_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 +; GFX1032_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: sub_i64_uniform: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b64 s[6:7], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB12_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s7, s3, s6 -; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 -; GFX1064-NEXT: s_mul_i32 s6, s2, s6 -; GFX1064-NEXT: s_add_i32 s8, s8, s7 -; GFX1064-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064-NEXT: v_mov_b32_e32 v1, s8 -; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB12_2: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 -; GFX1064-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: sub_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 +; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1164_ITERATIVE-NEXT: ds_sub_rtn_u64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB14_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v0 +; GFX1164_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: sub_i64_uniform: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB12_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s6, s3, s5 -; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 -; GFX1032-NEXT: s_mul_i32 s5, s2, s5 -; GFX1032-NEXT: s_add_i32 s7, s7, s6 -; GFX1032-NEXT: v_mov_b32_e32 v0, s5 -; GFX1032-NEXT: v_mov_b32_e32 v1, s7 -; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB12_2: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5] -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 -; GFX1032-NEXT: v_mov_b32_e32 v1, v4 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo -; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: sub_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_sub_rtn_u64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB14_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 +; GFX1132_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: sub_i64_uniform: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b64 s[6:7], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB12_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s7, s3, s6 -; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 -; GFX1164-NEXT: s_mul_i32 s6, s2, s6 -; GFX1164-NEXT: s_add_i32 s8, s8, s7 -; GFX1164-NEXT: v_mov_b32_e32 v0, s6 -; GFX1164-NEXT: v_mov_b32_e32 v1, s8 -; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB12_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1164-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] -; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v1, v5 -; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: sub_i64_uniform: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB12_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s6, s3, s5 -; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 -; GFX1132-NEXT: s_mul_i32 s5, s2, s5 -; GFX1132-NEXT: s_add_i32 s7, s7, s6 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7 -; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB12_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] -; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v1, v5 -; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm -entry: - %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 %subitive acq_rel - store i64 %old, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_DPP-LABEL: sub_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm ; +; GFX8_DPP-LABEL: sub_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB14_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB14_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v10 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v9 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: v_sub_u32_e32 v7, vcc, s5, v7 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_subb_u32_e32 v8, vcc, v0, v8, vcc +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm ; -; GFX7LESS-LABEL: sub_i64_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_DPP-LABEL: sub_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB14_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX9_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB14_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v10 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v9 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_sub_co_u32_e32 v7, vcc, s5, v7 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_subb_co_u32_e32 v8, vcc, v0, v8, vcc +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm ; -; GFX8-LABEL: sub_i64_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_DPP-LABEL: sub_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v4, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB14_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: ds_sub_rtn_u64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB14_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s0, v9 +; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s1, v10, vcc +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm ; -; GFX9-LABEL: sub_i64_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_DPP-LABEL: sub_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB14_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1032_DPP-NEXT: ds_sub_rtn_u64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB14_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s0, v9 +; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s1, v10, vcc_lo +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm ; -; GFX10-LABEL: sub_i64_varying: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX10-NEXT: s_endpgm -; -; GFX1164-LABEL: sub_i64_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX1164_DPP-LABEL: sub_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s4, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB14_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s0 +; GFX1164_DPP-NEXT: ds_sub_rtn_u64 v[10:11], v9, v[10:11] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB14_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v2 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, s0, v8 +; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s1, v9, vcc +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm ; -; GFX1132-LABEL: sub_i64_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX1132_DPP-LABEL: sub_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB14_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX1132_DPP-NEXT: ds_sub_rtn_u64 v[10:11], v9, v[10:11] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB14_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v2 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s0, v8 +; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s1, v9, vcc_lo +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -3087,277 +6671,638 @@ entry: } define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: and_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB15_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB15_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: and_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB15_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: and_i32_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: and_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB15_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: and_i32_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, -1 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB14_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_and_b32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB14_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB14_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_and_b32_e32 v0, s4, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: and_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB15_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: and_i32_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, -1 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB14_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_and_b32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB14_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB14_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: and_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB15_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: and_i32_varying: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s4, -1 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064-NEXT: s_and_b32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB14_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v2 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB14_4: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_and_b32_e32 v0, s2, v1 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: and_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: ds_and_rtn_b32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB15_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: and_i32_varying: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, -1 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_and_b32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB14_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 -; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v2 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB14_4: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_and_b32_e32 v0, s2, v1 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: and_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_and_rtn_b32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB15_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: and_i32_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s4, -1 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_and_b32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB14_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_and_rtn_b32 v1, v1, v2 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB14_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX7LESS_DPP-LABEL: and_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_and_rtn_b32 v0, v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm ; -; GFX1132-LABEL: and_i32_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, -1 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_and_b32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB14_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_and_rtn_b32 v1, v1, v2 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB14_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX8_DPP-LABEL: and_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB15_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_and_rtn_b32 v0, v0, v3 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB15_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: and_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB15_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_DPP-NEXT: ds_and_rtn_b32 v0, v0, v3 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB15_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: and_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB15_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB15_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: and_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB15_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB15_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: and_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB15_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB15_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: and_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB15_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB15_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = atomicrmw and ptr addrspace(3) @local_var32, i32 %lane acq_rel @@ -3365,1394 +7310,4362 @@ entry: ret void } -define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { +define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: and_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[8:9] +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB16_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB16_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX7LESS_ITERATIVE-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7LESS_ITERATIVE-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: and_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX8_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB16_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX8_ITERATIVE-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: or_i32_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: and_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB16_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9_ITERATIVE-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: or_i32_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB15_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_or_b32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB15_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB15_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_or_b32_e32 v0, s4, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: and_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB16_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1064_ITERATIVE-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX1064_ITERATIVE-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: or_i32_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB15_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_or_b32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB15_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB15_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_or_b32_e32 v0, s4, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: and_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s8 +; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB16_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1032_ITERATIVE-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX1032_ITERATIVE-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: or_i32_varying: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s4, 0 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064-NEXT: s_or_b32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB15_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v2 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB15_4: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_or_b32_e32 v0, s2, v1 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: and_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[8:9] +; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1164_ITERATIVE-NEXT: ds_and_rtn_b64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB16_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: or_i32_varying: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_or_b32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB15_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 -; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v2 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB15_4: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_or_b32_e32 v0, s2, v1 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: and_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s8 +; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_and_rtn_b64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB16_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: or_i32_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_or_b32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB15_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_or_rtn_b32 v1, v1, v2 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB15_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX7LESS_DPP-LABEL: and_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_and_rtn_b64 v[0:1], v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm ; -; GFX1132-LABEL: or_i32_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_or_b32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB15_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_or_rtn_b32 v1, v1, v2 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB15_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX8_DPP-LABEL: and_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB16_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_and_rtn_b64 v[7:8], v6, v[7:8] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB16_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX8_DPP-NEXT: v_and_b32_e32 v5, s5, v5 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: and_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB16_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: ds_and_rtn_b64 v[7:8], v6, v[7:8] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB16_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX9_DPP-NEXT: v_and_b32_e32 v5, s5, v5 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: and_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB16_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB16_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX1064_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: and_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB16_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB16_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX1032_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: and_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB16_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB16_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX1164_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: and_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB16_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB16_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX1132_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = atomicrmw or ptr addrspace(3) @local_var32, i32 %lane acq_rel - store i32 %old, ptr addrspace(1) %out + %lane_ext = zext i32 %lane to i64 + %old = atomicrmw and ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel + store i64 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { +define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: or_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB17_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB17_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: v_or_b32_e32 v0, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: or_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB17_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_or_b32_e32 v0, s4, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: xor_i32_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: or_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB17_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_or_b32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: xor_i32_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB16_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_xor_b32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB16_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB16_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: or_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB17_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: xor_i32_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB16_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_xor_b32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB16_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB16_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB16_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: or_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB17_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: xor_i32_varying: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s4, 0 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB16_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064-NEXT: s_xor_b32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB16_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB16_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v2 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB16_4: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_xor_b32_e32 v0, s2, v1 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: or_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: ds_or_rtn_b32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB17_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: xor_i32_varying: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB16_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_xor_b32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB16_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB16_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 -; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v2 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB16_4: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_xor_b32_e32 v0, s2, v1 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: or_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_or_rtn_b32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB17_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: xor_i32_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB16_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_xor_b32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB16_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB16_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_xor_rtn_b32 v1, v1, v2 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB16_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX7LESS_DPP-LABEL: or_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_or_rtn_b32 v0, v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm ; -; GFX1132-LABEL: xor_i32_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB16_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_xor_b32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB16_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB16_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_xor_rtn_b32 v1, v1, v2 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB16_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX8_DPP-LABEL: or_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB17_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_or_rtn_b32 v0, v3, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB17_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_or_b32_e32 v0, s4, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: or_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB17_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: ds_or_rtn_b32 v0, v3, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB17_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_or_b32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: or_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB17_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB17_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: or_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB17_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB17_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: or_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB17_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB17_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: or_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB17_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB17_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = atomicrmw xor ptr addrspace(3) @local_var32, i32 %lane acq_rel + %old = atomicrmw or ptr addrspace(3) @local_var32, i32 %lane acq_rel store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { +define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: or_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[8:9] +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB18_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB18_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX7LESS_ITERATIVE-NEXT: v_or_b32_e32 v2, s4, v2 +; GFX7LESS_ITERATIVE-NEXT: v_or_b32_e32 v1, s5, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: or_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX8_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB18_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_or_b32_e32 v2, s4, v2 +; GFX8_ITERATIVE-NEXT: v_or_b32_e32 v1, s5, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: max_i32_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: or_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB18_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_or_b32_e32 v2, s4, v2 +; GFX9_ITERATIVE-NEXT: v_or_b32_e32 v1, s5, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: max_i32_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_brev_b32 s4, 1 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB17_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_max_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB17_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB17_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_max_i32_e32 v0, s4, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: or_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB18_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1064_ITERATIVE-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX1064_ITERATIVE-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: max_i32_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_brev_b32 s4, 1 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB17_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_max_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB17_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB17_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_max_i32_e32 v0, s4, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: or_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s8 +; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB18_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1032_ITERATIVE-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX1032_ITERATIVE-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: max_i32_varying: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_brev_b32 s4, 1 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064-NEXT: s_max_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB17_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v2 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB17_4: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_max_i32_e32 v0, s2, v1 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: or_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[8:9] +; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1164_ITERATIVE-NEXT: ds_or_rtn_b64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB18_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v1, s2, v1 +; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: max_i32_varying: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_brev_b32 s0, 1 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_max_i32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB17_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 -; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v2 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB17_4: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_max_i32_e32 v0, s2, v1 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: or_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s8 +; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_or_rtn_b64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB18_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_or_b32_e32 v1, s2, v1 +; GFX1132_ITERATIVE-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: max_i32_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_brev_b32 s4, 1 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_max_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB17_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_max_rtn_i32 v1, v1, v2 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB17_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_i32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX7LESS_DPP-LABEL: or_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_or_rtn_b64 v[0:1], v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm ; -; GFX1132-LABEL: max_i32_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_brev_b32 s0, 1 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_max_i32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB17_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_max_rtn_i32 v1, v1, v2 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB17_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_i32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX8_DPP-LABEL: or_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_or_rtn_b64 v[7:8], v6, v[7:8] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB18_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_or_b32_e32 v6, s4, v6 +; GFX8_DPP-NEXT: v_or_b32_e32 v5, s5, v5 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: or_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: ds_or_rtn_b64 v[7:8], v6, v[7:8] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB18_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_or_b32_e32 v6, s4, v6 +; GFX9_DPP-NEXT: v_or_b32_e32 v5, s5, v5 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: or_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB18_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_or_b32_e32 v8, s0, v8 +; GFX1064_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: or_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB18_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_or_b32_e32 v8, s0, v8 +; GFX1032_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: or_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB18_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_e32 v8, s0, v8 +; GFX1164_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: or_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB18_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB18_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_e32 v8, s0, v8 +; GFX1132_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = atomicrmw max ptr addrspace(3) @local_var32, i32 %lane acq_rel + %lane_ext = zext i32 %lane to i64 + %old = atomicrmw or ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel + store i64 %old, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: xor_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB19_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB19_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: v_xor_b32_e32 v0, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: xor_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB19_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_xor_b32_e32 v0, s4, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: xor_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB19_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_xor_b32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: xor_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB19_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: xor_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB19_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: xor_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: ds_xor_rtn_b32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB19_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: xor_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_xor_rtn_b32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB19_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: xor_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_xor_rtn_b32 v0, v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: xor_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_xor_rtn_b32 v0, v3, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB19_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: xor_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: ds_xor_rtn_b32 v0, v3, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB19_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: xor_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB19_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: xor_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB19_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: xor_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB19_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: xor_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB19_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB19_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw xor ptr addrspace(3) @local_var32, i32 %lane acq_rel store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { +define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: xor_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[8:9] +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB20_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB20_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX7LESS_ITERATIVE-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX7LESS_ITERATIVE-NEXT: v_xor_b32_e32 v1, s5, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: xor_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB20_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX8_ITERATIVE-NEXT: v_xor_b32_e32 v1, s5, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: max_i64_constant: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB18_2 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB18_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc -; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 -; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] -; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 -; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: xor_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB20_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX9_ITERATIVE-NEXT: v_xor_b32_e32 v1, s5, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: max_i64_constant: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB18_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB18_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s5, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc -; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: xor_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB20_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1064_ITERATIVE-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX1064_ITERATIVE-NEXT: v_xor_b32_e32 v1, s3, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: max_i64_constant: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB18_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB18_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s5, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: xor_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s8 +; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB20_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1032_ITERATIVE-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX1032_ITERATIVE-NEXT: v_xor_b32_e32 v1, s3, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: max_i64_constant: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB18_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, 5 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB18_2: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc -; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc -; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] -; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc -; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: xor_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[8:9] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1164_ITERATIVE-NEXT: ds_xor_rtn_b64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB20_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v1, s2, v1 +; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: max_i64_constant: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB18_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, 5 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB18_2: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo -; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo -; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] -; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo -; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: xor_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s8 +; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_xor_rtn_b64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB20_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v1, s2, v1 +; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: max_i64_constant: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB18_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, 5 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB18_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc -; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] -; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc -; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX7LESS_DPP-LABEL: xor_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_xor_rtn_b64 v[0:1], v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm ; -; GFX1132-LABEL: max_i64_constant: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB18_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, 5 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 -; GFX1132-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB18_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo -; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] -; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo -; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX8_DPP-LABEL: xor_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB20_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v6, v[7:8] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB20_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_xor_b32_e32 v6, s4, v6 +; GFX8_DPP-NEXT: v_xor_b32_e32 v5, s5, v5 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: xor_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB20_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v6, v[7:8] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB20_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_xor_b32_e32 v6, s4, v6 +; GFX9_DPP-NEXT: v_xor_b32_e32 v5, s5, v5 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: xor_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB20_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB20_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_xor_b32_e32 v8, s0, v8 +; GFX1064_DPP-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: xor_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB20_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB20_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_xor_b32_e32 v8, s0, v8 +; GFX1032_DPP-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: xor_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB20_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB20_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_e32 v8, s0, v8 +; GFX1164_DPP-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: xor_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB20_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB20_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_e32 v8, s0, v8 +; GFX1132_DPP-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: - %old = atomicrmw max ptr addrspace(3) @local_var64, i64 5 acq_rel + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %lane_ext = zext i32 %lane to i64 + %old = atomicrmw xor ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel store i64 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { +define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: max_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s4, 1 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB21_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB21_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: v_max_i32_e32 v0, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: max_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_brev_b32 s4, 1 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB21_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_max_i32_e32 v0, s4, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: min_i32_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: max_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_brev_b32 s4, 1 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB21_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_max_i32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: min_i32_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB19_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_min_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB19_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB19_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_min_i32_e32 v0, s4, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: max_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_brev_b32 s4, 1 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB21_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: min_i32_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_brev_b32 s4, -2 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB19_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_min_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB19_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB19_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB19_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_min_i32_e32 v0, s4, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: max_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_brev_b32 s0, 1 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB21_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: min_i32_varying: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_brev_b32 s4, -2 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB19_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064-NEXT: s_min_i32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB19_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB19_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v2 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB19_4: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_min_i32_e32 v0, s2, v1 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: max_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_brev_b32 s4, 1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: ds_max_rtn_i32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB21_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: min_i32_varying: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_brev_b32 s0, -2 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB19_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_min_i32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB19_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB19_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 -; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v2 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB19_4: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_min_i32_e32 v0, s2, v1 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: max_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_brev_b32 s0, 1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_max_rtn_i32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB21_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: min_i32_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_brev_b32 s4, -2 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB19_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_min_i32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB19_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB19_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_min_rtn_i32 v1, v1, v2 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB19_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_i32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX7LESS_DPP-LABEL: max_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_max_rtn_i32 v0, v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm ; -; GFX1132-LABEL: min_i32_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_brev_b32 s0, -2 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB19_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_min_i32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB19_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB19_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_min_rtn_i32 v1, v1, v2 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB19_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_i32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX8_DPP-LABEL: max_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB21_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_max_rtn_i32 v0, v0, v3 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB21_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_max_i32_e32 v0, s4, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: max_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB21_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_DPP-NEXT: ds_max_rtn_i32 v0, v0, v3 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB21_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_max_i32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: max_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB21_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB21_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_max_i32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: max_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB21_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB21_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_max_i32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: max_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB21_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB21_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_i32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: max_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB21_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB21_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_i32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = atomicrmw min ptr addrspace(3) @local_var32, i32 %lane acq_rel + %old = atomicrmw max ptr addrspace(3) @local_var32, i32 %lane acq_rel store i32 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { -; -; -; GFX7LESS-LABEL: min_i64_constant: +define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { +; GFX7LESS-LABEL: max_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB20_2 +; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] +; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB20_2: +; GFX7LESS-NEXT: .LBB22_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc +; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 -; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -4760,30 +11673,30 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; -; GFX8-LABEL: min_i64_constant: +; GFX8-LABEL: max_i64_constant: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB20_2 +; GFX8-NEXT: s_cbranch_execz .LBB22_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] +; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB20_2: +; GFX8-NEXT: .LBB22_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 +; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -4794,29 +11707,29 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; -; GFX9-LABEL: min_i64_constant: +; GFX9-LABEL: max_i64_constant: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] +; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB20_2: +; GFX9-NEXT: .LBB22_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 +; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s4 @@ -4827,31 +11740,31 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: min_i64_constant: +; GFX1064-LABEL: max_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB20_2 +; GFX1064-NEXT: s_cbranch_execz .LBB22_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] +; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB20_2: +; GFX1064-NEXT: .LBB22_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc -; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc -; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc +; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] ; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc ; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -4860,30 +11773,30 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: min_i64_constant: +; GFX1032-LABEL: max_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB20_2 +; GFX1032-NEXT: s_cbranch_execz .LBB22_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] +; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB20_2: +; GFX1032-NEXT: .LBB22_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo -; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo -; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo +; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] ; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo ; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -4892,7 +11805,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: min_i64_constant: +; GFX1164-LABEL: max_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4900,23 +11813,23 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB20_2 +; GFX1164-NEXT: s_cbranch_execz .LBB22_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] +; GFX1164-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB20_2: +; GFX1164-NEXT: .LBB22_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc -; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc +; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc +; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] ; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -4927,29 +11840,29 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: min_i64_constant: +; GFX1132-LABEL: max_i64_constant: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB20_2 +; GFX1132-NEXT: s_cbranch_execz .LBB22_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 -; GFX1132-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] +; GFX1132-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB20_2: +; GFX1132-NEXT: .LBB22_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo -; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo +; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo +; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] ; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo ; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -4960,283 +11873,3632 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: - %old = atomicrmw min ptr addrspace(3) @local_var64, i64 5 acq_rel + %old = atomicrmw max ptr addrspace(3) @local_var64, i64 5 acq_rel store i64 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { +define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: max_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s1, 1 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB23_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB23_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2] +; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: max_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_brev_b32 s1, 1 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX8_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB23_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX8_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2] +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: umax_i32_varying: +; GFX9_ITERATIVE-LABEL: max_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_brev_b32 s1, 1 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB23_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX9_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2] +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: max_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_brev_b32 s1, 1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 +; GFX1064_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s[8:9], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB23_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[1:2] +; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc +; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: max_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_brev_b32 s1, 1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s8, s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB23_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[1:2] +; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo +; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: max_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_brev_b32 s1, 1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .p2align 6 +; GFX1164_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 +; GFX1164_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s[8:9], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1164_ITERATIVE-NEXT: ds_max_rtn_i64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB23_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: max_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_brev_b32 s1, 1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .p2align 6 +; GFX1132_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s8, s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_max_rtn_i64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB23_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: max_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_max_rtn_i64 v[0:1], v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: max_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: s_mov_b32 s0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX8_DPP-NEXT: s_brev_b32 s1, 1 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s1, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s0, v1, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX8_DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB23_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_max_rtn_i64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB23_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8] +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: max_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: s_mov_b32 s0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9_DPP-NEXT: s_brev_b32 s1, 1 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s1, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s0, v1, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX9_DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB23_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX9_DPP-NEXT: ds_max_rtn_i64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB23_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8] +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: max_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: s_mov_b32 s0, 0 +; GFX1064_DPP-NEXT: s_brev_b32 s1, 1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB23_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB23_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[9:10] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: max_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032_DPP-NEXT: s_brev_b32 s1, 1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB23_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1032_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB23_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[9:10] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: max_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164_DPP-NEXT: s_mov_b32 s0, 0 +; GFX1164_DPP-NEXT: s_brev_b32 s1, 1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s1 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[3:4] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB23_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1164_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB23_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[9:10] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: max_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1132_DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132_DPP-NEXT: s_brev_b32 s1, 1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s1 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[5:6], v[3:4] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB23_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX1132_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB23_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[9:10] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %lane_ext = zext i32 %lane to i64 + %old = atomicrmw max ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel + store i64 %old, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: min_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s4, -2 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB24_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB24_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: v_min_i32_e32 v0, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: min_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_brev_b32 s4, -2 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB24_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_min_i32_e32 v0, s4, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: min_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_brev_b32 s4, -2 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB24_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_min_i32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: min_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_brev_b32 s4, -2 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB24_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: min_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_brev_b32 s0, -2 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB24_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: min_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_brev_b32 s4, -2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: ds_min_rtn_i32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB24_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: min_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_brev_b32 s0, -2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_min_rtn_i32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB24_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: min_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_min_rtn_i32 v0, v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: min_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB24_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_min_rtn_i32 v0, v0, v3 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB24_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: min_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB24_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_DPP-NEXT: ds_min_rtn_i32 v0, v0, v3 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB24_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: min_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB24_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB24_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_min_i32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: min_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB24_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB24_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_min_i32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: min_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB24_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB24_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_i32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: min_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB24_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB24_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_i32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw min ptr addrspace(3) @local_var32, i32 %lane acq_rel + store i32 %old, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { +; GFX7LESS-LABEL: min_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB25_2 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 +; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: .LBB25_2: +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc +; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 +; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; -; GFX8-LABEL: umax_i32_varying: +; GFX8-LABEL: min_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB21_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_max_u32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB21_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_cbranch_execz .LBB25_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: v_mov_b32_e32 v0, 5 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB21_4: +; GFX8-NEXT: .LBB25_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_readfirstlane_b32 s5, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_max_u32_e32 v0, s4, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; -; GFX9-LABEL: umax_i32_varying: +; GFX9-LABEL: min_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB21_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_max_u32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB21_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB21_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v0, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: .LBB25_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: v_readfirstlane_b32 s5, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_max_u32_e32 v0, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: umax_i32_varying: +; GFX1064-LABEL: min_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s4, 0 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB21_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064-NEXT: s_max_u32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB21_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB21_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX1064-NEXT: s_cbranch_execz .LBB25_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_mov_b32_e32 v0, 5 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB21_4: +; GFX1064-NEXT: .LBB25_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc +; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc +; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_max_u32_e32 v0, s2, v1 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; -; GFX1032-LABEL: umax_i32_varying: +; GFX1032-LABEL: min_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB21_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_max_u32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB21_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB21_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 -; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB25_2 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_mov_b32_e32 v0, 5 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB21_4: +; GFX1032-NEXT: .LBB25_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo +; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_max_u32_e32 v0, s2, v1 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; -; GFX1164-LABEL: umax_i32_varying: +; GFX1164-LABEL: min_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB21_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_max_u32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB21_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB21_4 -; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_cbranch_execz .LBB25_2 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: v_mov_b32_e32 v0, 5 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_max_rtn_u32 v1, v1, v2 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB21_4: +; GFX1164-NEXT: .LBB25_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc +; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_u32_e32 v0, s2, v0 +; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc +; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; -; GFX1132-LABEL: umax_i32_varying: +; GFX1132-LABEL: min_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB21_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_max_u32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB21_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB21_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_max_rtn_u32 v1, v1, v2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1132-NEXT: s_cbranch_execz .LBB25_2 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_mov_b32_e32 v0, 5 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX1132-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB21_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: .LBB25_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo +; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_u32_e32 v0, s2, v0 +; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm +entry: + %old = atomicrmw min ptr addrspace(3) @local_var64, i64 5 acq_rel + store i64 %old, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: min_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s1, -2 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB26_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB26_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2] +; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: min_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_brev_b32 s1, -2 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX8_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB26_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX8_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2] +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: min_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_brev_b32 s1, -2 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB26_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX9_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2] +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: min_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_brev_b32 s1, -2 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 +; GFX1064_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB26_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2] +; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc +; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: min_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_brev_b32 s1, -2 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s8, s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB26_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[1:2] +; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo +; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: min_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_brev_b32 s1, -2 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .p2align 6 +; GFX1164_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 +; GFX1164_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1164_ITERATIVE-NEXT: ds_min_rtn_i64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB26_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: min_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_brev_b32 s1, -2 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .p2align 6 +; GFX1132_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s8, s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_min_rtn_i64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB26_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: min_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_min_rtn_i64 v[0:1], v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: min_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: s_mov_b32 s6, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_brev_b32 s7, -2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s7 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s6 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB26_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_min_rtn_i64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB26_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX8_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[7:8] +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s1 +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX8_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: min_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: s_mov_b32 s6, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_brev_b32 s7, -2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s7 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s6 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB26_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX9_DPP-NEXT: ds_min_rtn_i64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB26_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[7:8] +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s1 +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX9_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: min_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1064_DPP-NEXT: s_brev_b32 s7, -2 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s7 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s6 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s10, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s7, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s10, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s11, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB26_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB26_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[9:10] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: min_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1032_DPP-NEXT: s_brev_b32 s7, -2 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s7 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s6 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB26_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1032_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB26_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[9:10] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: min_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_brev_b32 s7, -2 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s7 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s6 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s7 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[5:6], v[3:4] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s11, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s10, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s11, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB26_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1164_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB26_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[9:10] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: min_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_brev_b32 s7, -2 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, s6 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s7 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[5:6], v[3:4] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB26_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX1132_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB26_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[9:10] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %lane_ext = zext i32 %lane to i64 + %old = atomicrmw min ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel + store i64 %old, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: umax_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB27_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB27_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: v_max_u32_e32 v0, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: umax_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB27_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_max_u32_e32 v0, s4, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: umax_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB27_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_max_u32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: umax_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB27_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: umax_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB27_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: umax_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: ds_max_rtn_u32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB27_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: umax_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_max_rtn_u32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB27_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: umax_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_max_rtn_u32 v0, v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: umax_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB27_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_max_rtn_u32 v0, v3, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB27_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_max_u32_e32 v0, s4, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: umax_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB27_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: ds_max_rtn_u32 v0, v3, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB27_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_max_u32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: umax_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB27_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB27_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_max_u32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: umax_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB27_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB27_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_max_u32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: umax_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB27_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB27_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_u32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: umax_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB27_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB27_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_u32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = atomicrmw umax ptr addrspace(3) @local_var32, i32 %lane acq_rel @@ -5245,8 +15507,6 @@ entry: } define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { -; -; ; GFX7LESS-LABEL: umax_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -5254,7 +15514,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2 +; GFX7LESS-NEXT: s_cbranch_execz .LBB28_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -5262,7 +15522,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB22_2: +; GFX7LESS-NEXT: .LBB28_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -5287,7 +15547,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB22_2 +; GFX8-NEXT: s_cbranch_execz .LBB28_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -5295,7 +15555,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB22_2: +; GFX8-NEXT: .LBB28_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 @@ -5320,14 +15580,14 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB22_2: +; GFX9-NEXT: .LBB28_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 @@ -5352,7 +15612,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB22_2 +; GFX1064-NEXT: s_cbranch_execz .LBB28_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -5360,7 +15620,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB22_2: +; GFX1064-NEXT: .LBB28_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 @@ -5384,7 +15644,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB22_2 +; GFX1032-NEXT: s_cbranch_execz .LBB28_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -5392,7 +15652,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB22_2: +; GFX1032-NEXT: .LBB28_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 @@ -5418,7 +15678,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB22_2 +; GFX1164-NEXT: s_cbranch_execz .LBB28_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -5426,7 +15686,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB22_2: +; GFX1164-NEXT: .LBB28_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 @@ -5452,14 +15712,14 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB22_2 +; GFX1132-NEXT: s_cbranch_execz .LBB28_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB22_2: +; GFX1132-NEXT: .LBB28_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 @@ -5483,278 +15743,1682 @@ entry: ret void } +define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: umax_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB29_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB29_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2] +; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: umax_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX8_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB29_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX8_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2] +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: umax_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB29_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX9_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2] +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: umax_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 +; GFX1064_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s[8:9], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB29_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[1:2] +; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc +; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: umax_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s8, s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB29_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[1:2] +; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo +; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: umax_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .p2align 6 +; GFX1164_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s[8:9], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1164_ITERATIVE-NEXT: ds_max_rtn_u64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB29_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: umax_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .p2align 6 +; GFX1132_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s8, s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_max_rtn_u64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB29_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: umax_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_max_rtn_u64 v[0:1], v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: umax_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB29_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_max_rtn_u64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB29_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8] +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: umax_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB29_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX9_DPP-NEXT: ds_max_rtn_u64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB29_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8] +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: umax_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB29_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB29_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[9:10] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: umax_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB29_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1032_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB29_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[9:10] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: umax_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[3:4] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB29_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1164_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB29_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[9:10] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: umax_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[5:6], v[3:4] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB29_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX1132_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB29_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[9:10] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %lane_ext = zext i32 %lane to i64 + %old = atomicrmw umax ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel + store i64 %old, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: umin_i32_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB30_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB30_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS_ITERATIVE-NEXT: v_min_u32_e32 v0, s4, v1 +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm ; +; GFX8_ITERATIVE-LABEL: umin_i32_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX8_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB30_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_min_u32_e32 v0, s4, v1 +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm ; -; GFX7LESS-LABEL: umin_i32_varying: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm +; GFX9_ITERATIVE-LABEL: umin_i32_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX9_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB30_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_min_u32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm ; -; GFX8-LABEL: umin_i32_varying: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, -1 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: .LBB23_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_min_u32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB23_4 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB23_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_min_u32_e32 v0, s4, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm +; GFX1064_ITERATIVE-LABEL: umin_i32_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1064_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB30_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm ; -; GFX9-LABEL: umin_i32_varying: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, -1 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: .LBB23_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_min_u32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cbranch_scc1 .LBB23_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB23_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB23_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_min_u32_e32 v0, s4, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX1032_ITERATIVE-LABEL: umin_i32_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1032_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 +; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB30_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm ; -; GFX1064-LABEL: umin_i32_varying: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s4, -1 -; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: .LBB23_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064-NEXT: s_min_u32 s4, s4, s8 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_cbranch_scc1 .LBB23_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB23_4 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v2 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB23_4: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_min_u32_e32 v0, s2, v1 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm +; GFX1164_ITERATIVE-LABEL: umin_i32_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: ds_min_rtn_u32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB30_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm ; -; GFX1032-LABEL: umin_i32_varying: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_mov_b32 s0, -1 -; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: .LBB23_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032-NEXT: s_min_u32 s0, s0, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cbranch_scc1 .LBB23_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB23_4 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s0 -; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v2 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB23_4: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_min_u32_e32 v0, s2, v1 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX1132_ITERATIVE-LABEL: umin_i32_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_min_rtn_u32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB30_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm ; -; GFX1164-LABEL: umin_i32_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b32 s4, -1 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: .LBB23_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-NEXT: s_min_u32 s4, s4, s8 -; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_cbranch_scc1 .LBB23_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164-NEXT: ; implicit-def: $vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB23_4 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: ds_min_rtn_u32 v1, v1, v2 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB23_4: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_u32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1164-NEXT: s_nop 0 -; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1164-NEXT: s_endpgm +; GFX7LESS_DPP-LABEL: umin_i32_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_min_rtn_u32 v0, v1, v0 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm ; -; GFX1132-LABEL: umin_i32_varying: -; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, -1 -; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: .LBB23_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: s_min_u32 s0, s0, s5 -; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1132-NEXT: s_cbranch_scc1 .LBB23_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132-NEXT: ; implicit-def: $vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB23_4 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132-NEXT: ds_min_rtn_u32 v1, v1, v2 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB23_4: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_u32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX1132-NEXT: s_nop 0 -; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX1132-NEXT: s_endpgm +; GFX8_DPP-LABEL: umin_i32_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 1 +; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB30_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_min_rtn_u32 v0, v0, v3 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB30_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: umin_i32_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: s_nop 1 +; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB30_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_DPP-NEXT: ds_min_rtn_u32 v0, v0, v3 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB30_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: umin_i32_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB30_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB30_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: umin_i32_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB30_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB30_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: umin_i32_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB30_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB30_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: umin_i32_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB30_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB30_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = atomicrmw umin ptr addrspace(3) @local_var32, i32 %lane acq_rel @@ -5763,8 +17427,6 @@ entry: } define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { -; -; ; GFX7LESS-LABEL: umin_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -5772,7 +17434,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB24_2 +; GFX7LESS-NEXT: s_cbranch_execz .LBB31_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -5780,7 +17442,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB24_2: +; GFX7LESS-NEXT: .LBB31_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -5805,7 +17467,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB24_2 +; GFX8-NEXT: s_cbranch_execz .LBB31_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -5813,7 +17475,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB24_2: +; GFX8-NEXT: .LBB31_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 @@ -5838,14 +17500,14 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB31_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB24_2: +; GFX9-NEXT: .LBB31_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 @@ -5870,7 +17532,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB24_2 +; GFX1064-NEXT: s_cbranch_execz .LBB31_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -5878,7 +17540,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB24_2: +; GFX1064-NEXT: .LBB31_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 @@ -5902,7 +17564,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB24_2 +; GFX1032-NEXT: s_cbranch_execz .LBB31_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -5910,7 +17572,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB24_2: +; GFX1032-NEXT: .LBB31_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 @@ -5936,7 +17598,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB24_2 +; GFX1164-NEXT: s_cbranch_execz .LBB31_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -5944,7 +17606,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB24_2: +; GFX1164-NEXT: .LBB31_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 @@ -5970,14 +17632,14 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB24_2 +; GFX1132-NEXT: s_cbranch_execz .LBB31_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB24_2: +; GFX1132-NEXT: .LBB31_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 @@ -6000,5 +17662,1046 @@ entry: store i64 %old, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX11: {{.*}} + +define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { +; GFX7LESS_ITERATIVE-LABEL: umin_i64_varying: +; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop +; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB32_1 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: .LBB32_4: +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2] +; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX7LESS_ITERATIVE-NEXT: s_endpgm +; +; GFX8_ITERATIVE-LABEL: umin_i64_varying: +; GFX8_ITERATIVE: ; %bb.0: ; %entry +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop +; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX8_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 +; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX8_ITERATIVE-NEXT: ; %bb.3: +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 +; GFX8_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: .LBB32_4: +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX8_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2] +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX8_ITERATIVE-NEXT: s_endpgm +; +; GFX9_ITERATIVE-LABEL: umin_i64_varying: +; GFX9_ITERATIVE: ; %bb.0: ; %entry +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop +; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 +; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 +; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX9_ITERATIVE-NEXT: ; %bb.3: +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: .LBB32_4: +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 +; GFX9_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2] +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5 +; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX9_ITERATIVE-NEXT: s_endpgm +; +; GFX1064_ITERATIVE-LABEL: umin_i64_varying: +; GFX1064_ITERATIVE: ; %bb.0: ; %entry +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop +; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 +; GFX1064_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s[8:9], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 +; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX1064_ITERATIVE-NEXT: ; %bb.3: +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1064_ITERATIVE-NEXT: .LBB32_4: +; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2] +; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc +; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1064_ITERATIVE-NEXT: s_endpgm +; +; GFX1032_ITERATIVE-LABEL: umin_i64_varying: +; GFX1032_ITERATIVE: ; %bb.0: ; %entry +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop +; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 +; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX1032_ITERATIVE-NEXT: ; %bb.3: +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1032_ITERATIVE-NEXT: .LBB32_4: +; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[1:2] +; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo +; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; GFX1032_ITERATIVE-NEXT: s_endpgm +; +; GFX1164_ITERATIVE-LABEL: umin_i64_varying: +; GFX1164_ITERATIVE: ; %bb.0: ; %entry +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .p2align 6 +; GFX1164_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop +; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s[8:9], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 +; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX1164_ITERATIVE-NEXT: ; %bb.3: +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1164_ITERATIVE-NEXT: ds_min_rtn_u64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1164_ITERATIVE-NEXT: .LBB32_4: +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: s_nop 0 +; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_ITERATIVE-NEXT: s_endpgm +; +; GFX1132_ITERATIVE-LABEL: umin_i64_varying: +; GFX1132_ITERATIVE: ; %bb.0: ; %entry +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .p2align 6 +; GFX1132_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop +; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 +; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 +; GFX1132_ITERATIVE-NEXT: ; %bb.3: +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1132_ITERATIVE-NEXT: ds_min_rtn_u64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv +; GFX1132_ITERATIVE-NEXT: .LBB32_4: +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: s_nop 0 +; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_ITERATIVE-NEXT: s_endpgm +; +; GFX7LESS_DPP-LABEL: umin_i64_varying: +; GFX7LESS_DPP: ; %bb.0: ; %entry +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: ds_min_rtn_u64 v[0:1], v1, v[0:1] +; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS_DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS_DPP-NEXT: s_endpgm +; +; GFX8_DPP-LABEL: umin_i64_varying: +; GFX8_DPP: ; %bb.0: ; %entry +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX8_DPP-NEXT: s_not_b64 exec, exec +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_cbranch_execz .LBB32_2 +; GFX8_DPP-NEXT: ; %bb.1: +; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: s_mov_b32 m0, -1 +; GFX8_DPP-NEXT: ds_min_rtn_u64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: .LBB32_2: +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8] +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX8_DPP-NEXT: s_endpgm +; +; GFX9_DPP-LABEL: umin_i64_varying: +; GFX9_DPP: ; %bb.0: ; %entry +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX9_DPP-NEXT: s_not_b64 exec, exec +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_cbranch_execz .LBB32_2 +; GFX9_DPP-NEXT: ; %bb.1: +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX9_DPP-NEXT: ds_min_rtn_u64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: .LBB32_2: +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8] +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX9_DPP-NEXT: s_endpgm +; +; GFX1064_DPP-LABEL: umin_i64_varying: +; GFX1064_DPP: ; %bb.0: ; %entry +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX1064_DPP-NEXT: s_not_b64 exec, exec +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_cbranch_execz .LBB32_2 +; GFX1064_DPP-NEXT: ; %bb.1: +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_gl0_inv +; GFX1064_DPP-NEXT: .LBB32_2: +; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[9:10] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: s_endpgm +; +; GFX1032_DPP-LABEL: umin_i64_varying: +; GFX1032_DPP: ; %bb.0: ; %entry +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_cbranch_execz .LBB32_2 +; GFX1032_DPP-NEXT: ; %bb.1: +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1032_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_gl0_inv +; GFX1032_DPP-NEXT: .LBB32_2: +; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[9:10] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: s_endpgm +; +; GFX1164_DPP-LABEL: umin_i64_varying: +; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[5:6], v[3:4] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s4, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_cbranch_execz .LBB32_2 +; GFX1164_DPP-NEXT: ; %bb.1: +; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1164_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_gl0_inv +; GFX1164_DPP-NEXT: .LBB32_2: +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[9:10] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1164_DPP-NEXT: s_nop 0 +; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164_DPP-NEXT: s_endpgm +; +; GFX1132_DPP-LABEL: umin_i64_varying: +; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[5:6], v[3:4] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_cbranch_execz .LBB32_2 +; GFX1132_DPP-NEXT: ; %bb.1: +; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX1132_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_gl0_inv +; GFX1132_DPP-NEXT: .LBB32_2: +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[9:10] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1132_DPP-NEXT: s_nop 0 +; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132_DPP-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %lane_ext = zext i32 %lane to i64 + %old = atomicrmw umin ptr addrspace(3) @local_var32, i64 %lane_ext acq_rel + store i64 %old, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index 995d3fee672913..f636fa5d83a57a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -573,13 +573,44 @@ entry: define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .LBB2_1: ; %ComputeLoop +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_readlane_b32 s8, v0, s5 +; GFX6-NEXT: v_writelane_b32 v1, s4, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_cbranch_vccnz .LBB2_1 +; GFX6-NEXT: ; %bb.2: ; %ComputeEnd +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_cbranch_execz .LBB2_4 +; GFX6-NEXT: ; %bb.3: +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX6-NEXT: .LBB2_4: +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -1625,13 +1656,44 @@ entry: define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .LBB6_1: ; %ComputeLoop +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_readlane_b32 s8, v0, s5 +; GFX6-NEXT: v_writelane_b32 v1, s4, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_cbranch_vccnz .LBB6_1 +; GFX6-NEXT: ; %bb.2: ; %ComputeEnd +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_cbranch_execz .LBB6_4 +; GFX6-NEXT: ; %bb.3: +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX6-NEXT: .LBB6_4: +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 720e2ef108076d..3e8565d34c6beb 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -589,14 +589,45 @@ entry: define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .LBB2_1: ; %ComputeLoop +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_readlane_b32 s8, v0, s5 +; GFX6-NEXT: v_writelane_b32 v1, s4, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_cbranch_vccnz .LBB2_1 +; GFX6-NEXT: ; %bb.2: ; %ComputeEnd +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_cbranch_execz .LBB2_4 +; GFX6-NEXT: ; %bb.3: +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc +; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX6-NEXT: .LBB2_4: +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -1798,14 +1829,45 @@ entry: define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: .LBB7_1: ; %ComputeLoop +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_readlane_b32 s8, v0, s5 +; GFX6-NEXT: v_writelane_b32 v1, s4, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_cbranch_vccnz .LBB7_1 +; GFX6-NEXT: ; %bb.2: ; %ComputeEnd +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_cbranch_execz .LBB7_4 +; GFX6-NEXT: ; %bb.3: +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_sub v0, v1, s[4:7], 0 idxen glc +; GFX6-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc +; GFX6-NEXT: .LBB7_4: +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index c9076a9541b237..22e00b2f5a6b1a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -102,8 +102,9 @@ define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX1200-NEXT: v_mov_b32_e32 v4, v3 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX1200-NEXT: global_wb scope:SCOPE_SYS ; GFX1200-NEXT: s_wait_storecnt 0x0 -; GFX1200-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX1200-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1200-NEXT: global_inv scope:SCOPE_SYS ; GFX1200-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -213,8 +214,9 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 { ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: global_wb scope:SCOPE_SE ; GFX1200-NEXT: s_wait_storecnt 0x0 -; GFX1200-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX1200-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1200-NEXT: global_inv scope:SCOPE_SE ; GFX1200-NEXT: s_setpc_b64 s[30:31] @@ -347,8 +349,9 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 { ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: global_wb scope:SCOPE_SE ; GFX1200-NEXT: s_wait_storecnt 0x0 -; GFX1200-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX1200-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SE ; GFX1200-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1200-NEXT: global_inv scope:SCOPE_SE ; GFX1200-NEXT: s_setpc_b64 s[30:31] @@ -446,8 +449,9 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX1200-NEXT: v_mov_b32_e32 v4, v3 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX1200-NEXT: global_wb scope:SCOPE_SE ; GFX1200-NEXT: s_wait_storecnt 0x0 -; GFX1200-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX1200-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1200-NEXT: global_inv scope:SCOPE_SE ; GFX1200-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 0cdd6b919f1c8b..23e8f98a7861bc 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -22,6 +22,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -234,6 +235,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -424,6 +426,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -853,9 +856,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -1127,8 +1131,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[2:3], v[4:5], v[0:1] ; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1407,8 +1411,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall(ptr a ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[11:12], v[13:14], v[5:6] ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 @@ -1924,14 +1928,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2294,14 +2298,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2670,14 +2674,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3294,22 +3298,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3719,22 +3723,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4150,22 +4154,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 @@ -4815,6 +4819,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5086,6 +5091,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -5338,6 +5344,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -5838,6 +5845,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6245,6 +6253,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -6643,6 +6652,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -7313,9 +7323,10 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 503065cc076477..ec0408236975d1 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -22,6 +22,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -221,6 +222,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -416,6 +418,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -821,11 +824,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -1037,10 +1040,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1265,9 +1269,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr a ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 @@ -1679,15 +1684,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v5 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2064,15 +2070,16 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v3 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2455,15 +2462,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v10 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v10 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 @@ -3094,22 +3102,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3521,22 +3529,22 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3954,22 +3962,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 @@ -4631,11 +4639,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v4, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4973,9 +4981,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5324,9 +5333,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 @@ -5922,27 +5932,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6367,11 +6377,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v3 :: v_dual_max_num_f32 v0, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 @@ -6818,11 +6828,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v9 :: v_dual_max_num_f32 v4, v4, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 @@ -7513,11 +7523,11 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v4, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index e0e6ccd72caeaa..cd01cc7309fcd2 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -22,6 +22,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -221,6 +222,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -416,6 +418,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -821,11 +824,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -1037,10 +1040,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1265,9 +1269,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr a ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 @@ -1679,15 +1684,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v5 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2064,15 +2070,16 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v3 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2455,15 +2462,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: v_min_num_f16_e32 v4, v4, v10 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v4, v4, v10 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 @@ -3094,22 +3102,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3521,22 +3529,22 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3954,22 +3962,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 @@ -4631,11 +4639,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v4, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4973,9 +4981,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5324,9 +5333,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v5, v4, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 @@ -5922,27 +5932,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v1, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6367,11 +6377,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v3 :: v_dual_min_num_f32 v0, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 @@ -6818,11 +6828,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v9 :: v_dual_min_num_f32 v4, v4, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 @@ -7513,11 +7523,11 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v4, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/check-subtarget-features.ll b/llvm/test/CodeGen/AMDGPU/check-subtarget-features.ll index c2469398110466..95ae8a6adfdf81 100644 --- a/llvm/test/CodeGen/AMDGPU/check-subtarget-features.ll +++ b/llvm/test/CodeGen/AMDGPU/check-subtarget-features.ll @@ -1,5 +1,3 @@ -; RUN: not llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,-wavefrontsize64 < %s 2>&1 | FileCheck %s -check-prefix=ERR -implicit-check-not=error: -; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,-wavefrontsize64 < %s 2>&1 | FileCheck %s -check-prefix=ERR -implicit-check-not=error: ; RUN: not llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+wavefrontsize64 < %s 2>&1 | FileCheck %s -check-prefix=ERR -implicit-check-not=error: ; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+wavefrontsize64 < %s 2>&1 | FileCheck %s -check-prefix=ERR -implicit-check-not=error: diff --git a/llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir b/llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir index 7a151c4530a03e..bf111d19f2147b 100644 --- a/llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir +++ b/llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=liveintervals -run-pass=twoaddressinstruction -verify-machineinstrs -o - %s | FileCheck --check-prefix=GFX90A %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --passes='require,two-address-instruction' -o - %s | FileCheck --check-prefix=GFX90A %s --- name: aligned_partial_vgpr_64 diff --git a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll index 9b20d9be278c6b..82dc6d21cfe33d 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll @@ -1,9 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead. -; GCN-LABEL: wwm: define amdgpu_hs void @wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) { +; GCN-LABEL: wwm: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: s_mov_b32 s6, s3 +; GCN-NEXT: s_mov_b32 s5, s2 +; GCN-NEXT: s_mov_b32 s4, s1 +; GCN-NEXT: s_mov_b32 s1, 1 +; GCN-NEXT: v_mov_b32_e32 v0, 4 +; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_mov_b64 exec, s[2:3] +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: s_cbranch_scc0 .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %bb42 +; GCN-NEXT: s_mov_b32 s1, 0 +; GCN-NEXT: .LBB0_2: ; %bb602 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GCN-NEXT: s_cbranch_vccnz .LBB0_4 +; GCN-NEXT: ; %bb.3: ; %bb49 +; GCN-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-NEXT: tbuffer_store_format_x v1, off, s[4:7], 1 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] offset:4 glc +; GCN-NEXT: .LBB0_4: ; %bb54 +; GCN-NEXT: s_endpgm entry: br label %work @@ -23,24 +50,44 @@ bb54: ret void work: -; GCN: s_not_b64 exec, exec -; GCN: v_mov_b32_e32 v[[tmp1189:[0-9]+]], 1 -; GCN: s_not_b64 exec, exec %tmp1189 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 4, i32 1) -; GCN: s_or_saveexec_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], -1 -; GCN: v_lshlrev_b32_e32 v[[tmp1191:[0-9]+]], 2, v[[tmp1189]] %tmp1191 = mul i32 %tmp1189, 4 -; GCN: s_mov_b64 exec, s[[[LO]]:[[HI]]] %tmp1196 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp1191) %tmp34 = icmp eq i32 %arg, 0 br i1 %tmp34, label %bb602, label %bb42 } -; GCN-LABEL: strict_wwm: define amdgpu_hs void @strict_wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) { +; GCN-LABEL: strict_wwm: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: s_mov_b32 s6, s3 +; GCN-NEXT: s_mov_b32 s5, s2 +; GCN-NEXT: s_mov_b32 s4, s1 +; GCN-NEXT: s_mov_b32 s1, 1 +; GCN-NEXT: v_mov_b32_e32 v0, 4 +; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_mov_b64 exec, s[2:3] +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: s_cbranch_scc0 .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %bb42 +; GCN-NEXT: s_mov_b32 s1, 0 +; GCN-NEXT: .LBB1_2: ; %bb602 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GCN-NEXT: s_cbranch_vccnz .LBB1_4 +; GCN-NEXT: ; %bb.3: ; %bb49 +; GCN-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-NEXT: tbuffer_store_format_x v1, off, s[4:7], 1 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] offset:4 glc +; GCN-NEXT: .LBB1_4: ; %bb54 +; GCN-NEXT: s_endpgm entry: br label %work @@ -60,16 +107,10 @@ bb54: ret void work: -; GCN: s_not_b64 exec, exec -; GCN: v_mov_b32_e32 v[[tmp1189:[0-9]+]], 1 -; GCN: s_not_b64 exec, exec %tmp1189 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 4, i32 1) -; GCN: s_or_saveexec_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], -1 -; GCN: v_lshlrev_b32_e32 v[[tmp1191:[0-9]+]], 2, v[[tmp1189]] %tmp1191 = mul i32 %tmp1189, 4 -; GCN: s_mov_b64 exec, s[[[LO]]:[[HI]]] %tmp1196 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp1191) %tmp34 = icmp eq i32 %arg, 0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index af7e11127fbaea..c5c44d27efbb36 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -20,8 +20,9 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -196,8 +197,9 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -382,8 +384,9 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -576,8 +579,9 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -782,8 +786,9 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -999,8 +1004,9 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1229,8 +1235,9 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1413,8 +1420,9 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1593,8 +1601,9 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1765,8 +1774,9 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1982,8 +1992,9 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2199,8 +2210,9 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2420,8 +2432,9 @@ define float @flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2596,8 +2609,9 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2782,8 +2796,9 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2976,8 +2991,9 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3182,8 +3198,9 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3399,8 +3416,9 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3629,8 +3647,9 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -3813,8 +3832,9 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -3995,8 +4015,9 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -4179,8 +4200,9 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -4353,8 +4375,9 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4529,8 +4552,9 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4735,8 +4759,9 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4911,8 +4936,9 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -5117,8 +5143,9 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -5293,8 +5320,9 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -5499,8 +5527,9 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -5675,8 +5704,9 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -5893,8 +5923,9 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -6073,8 +6104,9 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -6254,8 +6286,9 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -6445,8 +6478,9 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -6613,8 +6647,9 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -6788,8 +6823,9 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -6996,8 +7032,9 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7284,8 +7321,9 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7581,8 +7619,9 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7876,8 +7915,9 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8153,8 +8193,9 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8439,8 +8480,9 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8714,8 +8756,9 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8929,8 +8972,9 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -9162,8 +9206,9 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -9459,8 +9504,9 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -9760,8 +9806,9 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -10102,8 +10149,9 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -10454,8 +10502,9 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -10805,8 +10854,9 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11146,8 +11196,9 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11478,8 +11529,9 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11759,8 +11811,9 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12044,8 +12097,9 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -12377,8 +12431,9 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -12729,8 +12784,9 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -13046,8 +13102,9 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -13233,8 +13290,9 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -13423,8 +13481,9 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -13627,8 +13686,9 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -13806,8 +13866,9 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -13992,8 +14053,9 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-2048 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -14193,8 +14255,9 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -14385,8 +14448,9 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -14573,8 +14637,9 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -14760,8 +14825,9 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -14939,8 +15005,9 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -15126,8 +15193,9 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -15309,8 +15377,9 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -15584,8 +15653,9 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -15862,8 +15932,9 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -16154,8 +16225,9 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -16421,8 +16493,9 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -16695,8 +16768,9 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-2048 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -16984,8 +17058,9 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -17264,8 +17339,9 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -17540,8 +17616,9 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -17815,8 +17892,9 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -18082,8 +18160,9 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -18357,8 +18436,9 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index e6d6652c8b7b71..a3424793fdc4d5 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -20,8 +20,9 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -162,8 +163,9 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -310,8 +312,9 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -473,8 +476,9 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -613,8 +617,9 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -760,8 +765,9 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048 +; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -933,8 +939,9 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1148,8 +1155,9 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -1350,8 +1358,9 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1492,8 +1501,9 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1638,8 +1648,9 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1780,8 +1791,9 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1928,8 +1940,9 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2091,8 +2104,9 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2231,8 +2245,9 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2378,8 +2393,9 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048 +; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2551,8 +2567,9 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2766,8 +2783,9 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -2982,8 +3000,9 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3135,8 +3154,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3293,8 +3313,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3464,8 +3485,9 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3612,8 +3634,9 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3767,8 +3790,9 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3937,8 +3961,9 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4090,8 +4115,9 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4261,8 +4287,9 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4563,8 +4590,9 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4874,8 +4902,9 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5184,8 +5213,9 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -5477,8 +5507,9 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -5779,8 +5810,9 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -6071,8 +6103,9 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6305,8 +6338,9 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -6547,8 +6581,9 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6859,8 +6894,9 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -7173,8 +7209,9 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7516,8 +7553,9 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7869,8 +7907,9 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8219,8 +8258,9 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8551,8 +8591,9 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -8893,8 +8934,9 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9226,8 +9268,9 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -9508,8 +9551,9 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9798,8 +9842,9 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -10151,8 +10196,9 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -10479,8 +10525,9 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10713,8 +10760,9 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10950,8 +10998,9 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11204,8 +11253,9 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11429,8 +11479,9 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11661,8 +11712,9 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11912,8 +11964,9 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -12150,8 +12203,9 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12407,8 +12461,9 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -12751,8 +12806,9 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -13098,8 +13154,9 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -13461,8 +13518,9 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -13794,8 +13852,9 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -14134,8 +14193,9 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -14494,8 +14554,9 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -14841,8 +14902,9 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 6babcfd15ee1ac..0d954e277cdd58 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -20,8 +20,9 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -162,8 +163,9 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -310,8 +312,9 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -473,8 +476,9 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -613,8 +617,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -760,8 +765,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 +; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -933,8 +939,9 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1148,8 +1155,9 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -1350,8 +1358,9 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1492,8 +1501,9 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1638,8 +1648,9 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1780,8 +1791,9 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1928,8 +1940,9 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2091,8 +2104,9 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2231,8 +2245,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 +; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2378,8 +2393,9 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 +; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2551,8 +2567,9 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2766,8 +2783,9 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -2982,8 +3000,9 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3135,8 +3154,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3293,8 +3313,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3464,8 +3485,9 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3612,8 +3634,9 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3767,8 +3790,9 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3937,8 +3961,9 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4090,8 +4115,9 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4261,8 +4287,9 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4563,8 +4590,9 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4874,8 +4902,9 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5184,8 +5213,9 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -5477,8 +5507,9 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -5779,8 +5810,9 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -6071,8 +6103,9 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6305,8 +6338,9 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -6547,8 +6581,9 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6859,8 +6894,9 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -7173,8 +7209,9 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7516,8 +7553,9 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7869,8 +7907,9 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8219,8 +8258,9 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8551,8 +8591,9 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -8893,8 +8934,9 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9226,8 +9268,9 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -9508,8 +9551,9 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9798,8 +9842,9 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -10151,8 +10196,9 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -10479,8 +10525,9 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10713,8 +10760,9 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10950,8 +10998,9 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11204,8 +11253,9 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11429,8 +11479,9 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11661,8 +11712,9 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11912,8 +11964,9 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -12150,8 +12203,9 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12407,8 +12461,9 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -12751,8 +12806,9 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -13098,8 +13154,9 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -13461,8 +13518,9 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -13794,8 +13852,9 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -14134,8 +14193,9 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -14494,8 +14554,9 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -14841,8 +14902,9 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index bce522c0c3f0a4..47eb89eed9019c 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -28,8 +28,9 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -222,8 +223,9 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -420,8 +422,9 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -634,8 +637,9 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -818,8 +822,9 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1009,8 +1014,9 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1220,8 +1226,9 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1418,8 +1425,9 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1617,8 +1625,9 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1811,8 +1820,9 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2009,8 +2019,9 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2223,8 +2234,9 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2407,8 +2419,9 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2598,8 +2611,9 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2809,8 +2823,9 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -3007,8 +3022,9 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -3206,8 +3222,9 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3416,8 +3433,9 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3627,8 +3645,9 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3851,8 +3870,9 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4045,8 +4065,9 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4246,8 +4267,9 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4485,8 +4507,9 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4773,8 +4796,9 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5070,8 +5094,9 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5365,8 +5390,9 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -5642,8 +5668,9 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -5928,8 +5955,9 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6205,8 +6233,9 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6424,8 +6453,9 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6651,8 +6681,9 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6948,8 +6979,9 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -7249,8 +7281,9 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7591,8 +7624,9 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7943,8 +7977,9 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8292,8 +8327,9 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8623,8 +8659,9 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -8964,8 +9001,9 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9296,8 +9334,9 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -9577,8 +9616,9 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9866,8 +9906,9 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -10218,8 +10259,9 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -10543,8 +10585,9 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10760,8 +10803,9 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10980,8 +11024,9 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11216,8 +11261,9 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11422,8 +11468,9 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11635,8 +11682,9 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11868,8 +11916,9 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -12088,8 +12137,9 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -12329,8 +12379,9 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -12673,8 +12724,9 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -13020,8 +13072,9 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -13383,8 +13436,9 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -13716,8 +13770,9 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -14056,8 +14111,9 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -14416,8 +14472,9 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -14763,8 +14820,9 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index 5420733b7dc557..86e6224d2f8d56 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -40,7 +40,8 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -97,7 +98,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -160,7 +162,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -223,7 +226,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -270,7 +274,8 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -322,7 +327,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -380,7 +386,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -438,7 +445,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -488,7 +496,8 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -545,7 +554,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -608,7 +618,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -671,7 +682,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -718,7 +730,8 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -770,7 +783,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -828,7 +842,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -886,7 +901,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -936,7 +952,8 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -993,7 +1010,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -1056,7 +1074,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -1119,7 +1138,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1166,7 +1186,8 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -1218,7 +1239,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -1276,7 +1298,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -1334,7 +1357,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1382,7 +1406,8 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -1439,7 +1464,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -1500,7 +1526,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -1563,7 +1590,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1608,7 +1636,8 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -1660,7 +1689,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -1716,7 +1746,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -1774,7 +1805,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1822,7 +1854,8 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -1879,7 +1912,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -1940,7 +1974,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2003,7 +2038,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2048,7 +2084,8 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2100,7 +2137,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -2156,7 +2194,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2214,7 +2253,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2262,7 +2302,8 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2319,7 +2360,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -2380,7 +2422,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2443,7 +2486,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2488,7 +2532,8 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2540,7 +2585,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -2596,7 +2642,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2654,7 +2701,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2702,7 +2750,8 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2759,7 +2808,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -2820,7 +2870,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2883,7 +2934,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2928,7 +2980,8 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2980,7 +3033,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -3036,7 +3090,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -3094,7 +3149,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3144,7 +3200,8 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -3201,7 +3258,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -3264,7 +3322,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -3327,7 +3386,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3374,7 +3434,8 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -3426,7 +3487,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -3484,7 +3546,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -3542,7 +3605,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3592,7 +3656,8 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -3639,7 +3704,8 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -3686,7 +3752,8 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -3743,7 +3810,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -3806,7 +3874,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -3869,7 +3938,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3916,7 +3986,8 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -3968,7 +4039,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -4026,7 +4098,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4084,7 +4157,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4134,7 +4208,8 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4191,7 +4266,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -4254,7 +4330,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4317,7 +4394,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4364,7 +4442,8 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4416,7 +4495,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -4474,7 +4554,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4532,7 +4613,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4584,7 +4666,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -4631,7 +4713,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -4694,7 +4776,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -4755,7 +4837,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -4800,7 +4882,8 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -4837,7 +4920,8 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: store atomic i64 %in, ptr %out seq_cst, align 8 @@ -4890,7 +4974,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4941,7 +5026,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4995,7 +5081,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5051,7 +5138,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5109,7 +5197,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5173,7 +5262,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5244,7 +5334,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -5301,7 +5392,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5354,7 +5446,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5413,7 +5506,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5479,7 +5573,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -5532,7 +5627,7 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -5579,7 +5674,7 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -5642,7 +5737,7 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -5703,7 +5798,7 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -5748,7 +5843,8 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr double, ptr %out, i64 4 @@ -5785,7 +5881,8 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: store atomic double %in, ptr %out seq_cst, align 8 @@ -5838,7 +5935,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr %out, i64 %index @@ -5889,7 +5987,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr %out, i64 %index @@ -5934,7 +6033,8 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5991,7 +6091,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -6054,7 +6155,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6117,7 +6219,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -6164,7 +6267,8 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6216,7 +6320,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -6274,7 +6379,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6332,7 +6438,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -6382,7 +6489,8 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6439,7 +6547,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -6502,7 +6611,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6565,7 +6675,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -6612,7 +6723,8 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6664,7 +6776,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -6722,7 +6835,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6780,7 +6894,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll index 65046681ffc208..1914b74be1909b 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll @@ -19,6 +19,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE ; GFX12-SDAG-NEXT: ds_pk_add_f16 v0, v1 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE @@ -29,6 +30,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE ; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE @@ -43,6 +45,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE ; GFX12-SDAG-NEXT: ds_pk_add_bf16 v0, v1 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE @@ -53,6 +56,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE ; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE @@ -69,6 +73,7 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 @@ -82,6 +87,7 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 @@ -99,6 +105,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 @@ -112,6 +119,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll index cdfc8f48349f62..95d8ca391b8438 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -58,7 +58,8 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v3 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -97,7 +98,8 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v3 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -160,8 +162,9 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -329,6 +332,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_pk_add_f16 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -352,6 +356,7 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -377,6 +382,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_pk_add_bf16 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -400,6 +406,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx10-twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/gfx10-twoaddr-fma.mir index 0ffecef1af26f2..0f20b8a2f1e29f 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx10-twoaddr-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/gfx10-twoaddr-fma.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GFX10 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - -early-live-intervals | FileCheck --check-prefixes=GFX10 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 %s --passes=two-address-instruction -o - | FileCheck --check-prefixes=GFX10 %s # GFX10-LABEL: name: test_fmamk_reg_imm_f16 # GFX10: %2:vgpr_32 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir index a197715d520b9b..91ade8806e4d1a 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s --passes=two-address-instruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GFX11 %s --- name: test_fmamk_reg_imm_f16 diff --git a/llvm/test/CodeGen/AMDGPU/global-alias.ll b/llvm/test/CodeGen/AMDGPU/global-alias.ll new file mode 100644 index 00000000000000..5c1c4977cabb0e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global-alias.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -verify-machineinstrs %s -o - | FileCheck %s + +@foo_a = alias void (ptr), ptr @foo +@bar_a = alias void (ptr), ptr @foo_a + +define void @foo() { +; CHECK-LABEL: foo: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + ret void +} + +define void @bar() { +; CHECK-LABEL: bar: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s16, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[18:19] +; CHECK-NEXT: s_waitcnt expcnt(0) +; CHECK-NEXT: v_writelane_b32 v40, s16, 2 +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, bar_a@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, bar_a@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: v_readlane_b32 s4, v40, 2 +; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_addk_i32 s32, 0xfc00 +; CHECK-NEXT: s_mov_b32 s33, s4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + call void @bar_a(ptr null) + ret void +} + +; UTC_ARGS: --disable +; CHECK: .set foo_a, foo +; CHECK: .set bar_a, foo_a +; UTC_ARGS: --enable diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll index 9d8b987d2ba68c..997ba4053bb292 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll @@ -1,255 +1,1251 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX90A,GFX90A_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX90A,GFX90A_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX940,GFX940_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -stop-after=si-fix-sgpr-copies < %s | FileCheck -check-prefixes=GFX940,GFX940_DPP %s define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX90A-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX940-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX940-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX90A-NEXT: $sgpr0 = COPY [[COPY5]] + ; GFX90A-NEXT: $sgpr1 = COPY [[COPY6]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX940-NEXT: $sgpr0 = COPY [[COPY5]] + ; GFX940-NEXT: $sgpr1 = COPY [[COPY6]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX90A-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX940-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX940-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX90A-NEXT: $sgpr0 = COPY [[COPY5]] + ; GFX90A-NEXT: $sgpr1 = COPY [[COPY6]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 + ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 + ; GFX940-NEXT: $sgpr0 = COPY [[COPY5]] + ; GFX940-NEXT: $sgpr1 = COPY [[COPY6]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret void } define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw + ; GFX90A: bb.0 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX90A-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX940-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX940-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret double %ret } define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX90A_ITERATIVE: bb.0 (%ir-block.0): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000) + ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.1 (%ir-block.5): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[COPY6]] + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.7): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY %42 + ; GFX90A_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY8]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.3.Flow: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.4 (%ir-block.9): + ; GFX90A_ITERATIVE-NEXT: S_ENDPGM 0 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.5.Flow1: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.4 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.6.ComputeLoop: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %26, %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:sreg_64 = PHI [[COPY7]], %bb.1, %5, %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI1]] + ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY9]], [[S_FF1_I32_B64_]] + ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_FF1_I32_B64_]] + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE2]] + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1 + ; GFX90A_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc + ; GFX90A_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI1]], killed [[S_LSHL_B64_]], implicit-def dead $scc + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX90A_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc + ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeEnd: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY13]], [[COPY14]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY12]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; + ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX90A_DPP: bb.0 (%ir-block.0): + ; GFX90A_DPP-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) + ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.1 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.1 (%ir-block.5): + ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub1 + ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub0 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY8]], [[COPY9]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY7]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX90A_DPP-NEXT: early-clobber %1:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.2 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.2 (%ir-block.31): + ; GFX90A_DPP-NEXT: successors: %bb.3(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY %1 + ; GFX90A_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY12]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.3.Flow: + ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.4 (%ir-block.33): + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_ENDPGM 0 + ; + ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX940_ITERATIVE: bb.0 (%ir-block.0): + ; GFX940_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000) + ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.1 (%ir-block.5): + ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[COPY6]] + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.7): + ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY %41 + ; GFX940_ITERATIVE-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY8]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.3.Flow: + ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.4 (%ir-block.9): + ; GFX940_ITERATIVE-NEXT: S_ENDPGM 0 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.5.Flow1: + ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.4 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.6.ComputeLoop: + ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %25, %bb.6 + ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:sreg_64 = PHI [[COPY7]], %bb.1, %5, %bb.6 + ; GFX940_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI1]] + ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY9]], [[S_FF1_I32_B64_]] + ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_FF1_I32_B64_]] + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE2]] + ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1 + ; GFX940_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc + ; GFX940_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI1]], killed [[S_LSHL_B64_]], implicit-def dead $scc + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX940_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc + ; GFX940_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.7.ComputeEnd: + ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_]], %bb.6 + ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY13]], [[COPY14]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY12]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; + ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw + ; GFX940_DPP: bb.0 (%ir-block.0): + ; GFX940_DPP-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) + ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.1 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.1 (%ir-block.5): + ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub1 + ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub0 + ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY8]], [[COPY9]], implicit $exec + ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY7]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY10]], [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX940_DPP-NEXT: early-clobber %1:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec + ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.2 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.2 (%ir-block.31): + ; GFX940_DPP-NEXT: successors: %bb.3(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY %1 + ; GFX940_DPP-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], [[COPY12]], [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.3.Flow: + ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.4 (%ir-block.33): + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret void } define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX90A_ITERATIVE: bb.0 (%ir-block.0): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000) + ; GFX90A_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX90A_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX90A_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]] + ; GFX90A_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.1 (%ir-block.5): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[COPY7]] + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]] + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.2 (%ir-block.7): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.3(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY %81 + ; GFX90A_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY10]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.3 (%ir-block.9): + ; GFX90A_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %91, %bb.7, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2 + ; GFX90A_ITERATIVE-NEXT: SI_END_CF %15, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_1]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE2]], 0, %12, 0, 0, implicit $mode, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:sreg_64_xexec = COPY %14 + ; GFX90A_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A_ITERATIVE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[COPY13]], 0, [[COPY15]], [[COPY14]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:sreg_64_xexec = COPY %14 + ; GFX90A_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A_ITERATIVE-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[COPY16]], 0, [[COPY18]], [[COPY17]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_CNDMASK_B32_e64_1]], %subreg.sub0, [[V_CNDMASK_B32_e64_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.4 (%ir-block.14): + ; GFX90A_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY %5.sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY %5.sub1 + ; GFX90A_ITERATIVE-NEXT: $sgpr0 = COPY [[COPY20]] + ; GFX90A_ITERATIVE-NEXT: $sgpr1 = COPY [[COPY21]] + ; GFX90A_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.5.Flow: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, [[COPY19]], %bb.3 + ; GFX90A_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.4 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.6.ComputeLoop: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %43, %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI [[COPY9]], %bb.1, %9, %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:sreg_64 = PHI [[COPY8]], %bb.1, %11, %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI4]] + ; GFX90A_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY22]], [[S_FF1_I32_B64_]] + ; GFX90A_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY23]], [[S_FF1_I32_B64_]] + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 + ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]] + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY25]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_2]], $m0, [[COPY24]] + ; GFX90A_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 + ; GFX90A_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]] + ; GFX90A_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY27]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_3]], $m0, [[COPY26]] + ; GFX90A_ITERATIVE-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_1]], %subreg.sub0, [[V_WRITELANE_B32_]], %subreg.sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY28:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE5]] + ; GFX90A_ITERATIVE-NEXT: [[COPY29:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE4]] + ; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY29]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1 + ; GFX90A_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc + ; GFX90A_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI4]], killed [[S_LSHL_B64_]], implicit-def dead $scc + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX90A_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc + ; GFX90A_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: bb.7.ComputeEnd: + ; GFX90A_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX90A_ITERATIVE-NEXT: {{ $}} + ; GFX90A_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[COPY28]], %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.6 + ; GFX90A_ITERATIVE-NEXT: [[COPY30:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub1 + ; GFX90A_ITERATIVE-NEXT: [[COPY31:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub0 + ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_ITERATIVE-NEXT: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY31]], [[COPY32]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY30]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX90A_ITERATIVE-NEXT: [[COPY33:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] + ; GFX90A_ITERATIVE-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_ITERATIVE-NEXT: [[COPY34:%[0-9]+]]:vreg_64_align2 = COPY [[DEF9]] + ; GFX90A_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; + ; GFX90A_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX90A_DPP: bb.0 (%ir-block.0): + ; GFX90A_DPP-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; GFX90A_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX90A_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX90A_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX90A_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX90A_DPP-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]] + ; GFX90A_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.1 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.1 (%ir-block.5): + ; GFX90A_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1 + ; GFX90A_DPP-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX90A_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY9]], [[COPY10]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY8]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX90A_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX90A_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX90A_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY12]], [[S_MOV_B32_1]] + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX90A_DPP-NEXT: early-clobber %2:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] + ; GFX90A_DPP-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]] + ; GFX90A_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.2 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.2 (%ir-block.32): + ; GFX90A_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY %2 + ; GFX90A_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY15]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A_DPP-NEXT: S_BRANCH %bb.4 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.3.Flow: + ; GFX90A_DPP-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, %8, %bb.4 + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: S_BRANCH %bb.5 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.4 (%ir-block.35): + ; GFX90A_DPP-NEXT: successors: %bb.3(0x80000000) + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY14]], %bb.1, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2 + ; GFX90A_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY16]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX90A_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY17]], implicit $exec + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_1]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1 + ; GFX90A_DPP-NEXT: early-clobber %56:vreg_64_align2 = STRICT_WWM [[V_MOV_B7]], implicit $exec + ; GFX90A_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE3]], 0, killed %56, 0, 0, implicit $mode, implicit $exec + ; GFX90A_DPP-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_6]].sub1 + ; GFX90A_DPP-NEXT: [[COPY19:%[0-9]+]]:sreg_64_xexec = COPY [[COPY13]] + ; GFX90A_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A_DPP-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[COPY18]], 0, [[COPY20]], [[COPY19]], implicit $exec + ; GFX90A_DPP-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_6]].sub0 + ; GFX90A_DPP-NEXT: [[COPY22:%[0-9]+]]:sreg_64_xexec = COPY [[COPY13]] + ; GFX90A_DPP-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX90A_DPP-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[COPY21]], 0, [[COPY23]], [[COPY22]], implicit $exec + ; GFX90A_DPP-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_DPP-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_CNDMASK_B32_e64_1]], %subreg.sub0, [[V_CNDMASK_B32_e64_]], %subreg.sub1 + ; GFX90A_DPP-NEXT: [[COPY24:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]] + ; GFX90A_DPP-NEXT: S_BRANCH %bb.3 + ; GFX90A_DPP-NEXT: {{ $}} + ; GFX90A_DPP-NEXT: bb.5 (%ir-block.41): + ; GFX90A_DPP-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX90A_DPP-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX90A_DPP-NEXT: $sgpr0 = COPY [[COPY25]] + ; GFX90A_DPP-NEXT: $sgpr1 = COPY [[COPY26]] + ; GFX90A_DPP-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; + ; GFX940_ITERATIVE-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX940_ITERATIVE: bb.0 (%ir-block.0): + ; GFX940_ITERATIVE-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000) + ; GFX940_ITERATIVE-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_ITERATIVE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_ITERATIVE-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940_ITERATIVE-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940_ITERATIVE-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940_ITERATIVE-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX940_ITERATIVE-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX940_ITERATIVE-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]] + ; GFX940_ITERATIVE-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.1 (%ir-block.5): + ; GFX940_ITERATIVE-NEXT: successors: %bb.6(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808 + ; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[COPY7]] + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]] + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.6 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.2 (%ir-block.7): + ; GFX940_ITERATIVE-NEXT: successors: %bb.3(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY %80 + ; GFX940_ITERATIVE-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY10]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.3 (%ir-block.9): + ; GFX940_ITERATIVE-NEXT: successors: %bb.5(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI %90, %bb.7, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2 + ; GFX940_ITERATIVE-NEXT: SI_END_CF %15, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_1]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE2]], 0, %12, 0, 0, implicit $mode, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY14:%[0-9]+]]:sreg_64_xexec = COPY %14 + ; GFX940_ITERATIVE-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940_ITERATIVE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[COPY13]], 0, [[COPY15]], [[COPY14]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY17:%[0-9]+]]:sreg_64_xexec = COPY %14 + ; GFX940_ITERATIVE-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940_ITERATIVE-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[COPY16]], 0, [[COPY18]], [[COPY17]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_CNDMASK_B32_e64_1]], %subreg.sub0, [[V_CNDMASK_B32_e64_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY19:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]] + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.5 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.4 (%ir-block.14): + ; GFX940_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY %5.sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY %5.sub1 + ; GFX940_ITERATIVE-NEXT: $sgpr0 = COPY [[COPY20]] + ; GFX940_ITERATIVE-NEXT: $sgpr1 = COPY [[COPY21]] + ; GFX940_ITERATIVE-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.5.Flow: + ; GFX940_ITERATIVE-NEXT: successors: %bb.4(0x80000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, [[COPY19]], %bb.3 + ; GFX940_ITERATIVE-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.4 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.6.ComputeLoop: + ; GFX940_ITERATIVE-NEXT: successors: %bb.7(0x04000000), %bb.6(0x7c000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[V_MOV_B]], %bb.1, %42, %bb.6 + ; GFX940_ITERATIVE-NEXT: [[PHI3:%[0-9]+]]:vreg_64_align2 = PHI [[COPY9]], %bb.1, %9, %bb.6 + ; GFX940_ITERATIVE-NEXT: [[PHI4:%[0-9]+]]:sreg_64 = PHI [[COPY8]], %bb.1, %11, %bb.6 + ; GFX940_ITERATIVE-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sreg_32 = S_FF1_I32_B64 [[PHI4]] + ; GFX940_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub1 + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY22]], [[S_FF1_I32_B64_]] + ; GFX940_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0 + ; GFX940_ITERATIVE-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY23]], [[S_FF1_I32_B64_]] + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 + ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]] + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY25]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_2]], $m0, [[COPY24]] + ; GFX940_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[PHI3]].sub0 + ; GFX940_ITERATIVE-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 + ; GFX940_ITERATIVE-NEXT: $m0 = COPY [[S_FF1_I32_B64_]] + ; GFX940_ITERATIVE-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY27]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READFIRSTLANE_B32_3]], $m0, [[COPY26]] + ; GFX940_ITERATIVE-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_1]], %subreg.sub0, [[V_WRITELANE_B32_]], %subreg.sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY28:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE5]] + ; GFX940_ITERATIVE-NEXT: [[COPY29:%[0-9]+]]:sreg_64 = COPY killed [[REG_SEQUENCE4]] + ; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY29]], 0, 0, implicit $mode, implicit $exec + ; GFX940_ITERATIVE-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1 + ; GFX940_ITERATIVE-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[S_MOV_B64_]], [[S_FF1_I32_B64_]], implicit-def dead $scc + ; GFX940_ITERATIVE-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI4]], killed [[S_LSHL_B64_]], implicit-def dead $scc + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX940_ITERATIVE-NEXT: S_CMP_LG_U64 [[S_ANDN2_B64_]], killed [[S_MOV_B64_1]], implicit-def $scc + ; GFX940_ITERATIVE-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.7 + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: bb.7.ComputeEnd: + ; GFX940_ITERATIVE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX940_ITERATIVE-NEXT: {{ $}} + ; GFX940_ITERATIVE-NEXT: [[PHI5:%[0-9]+]]:vreg_64_align2 = PHI [[COPY28]], %bb.6 + ; GFX940_ITERATIVE-NEXT: [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[V_ADD_F64_e64_1]], %bb.6 + ; GFX940_ITERATIVE-NEXT: [[COPY30:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub1 + ; GFX940_ITERATIVE-NEXT: [[COPY31:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub0 + ; GFX940_ITERATIVE-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_ITERATIVE-NEXT: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY31]], [[COPY32]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY30]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX940_ITERATIVE-NEXT: [[COPY33:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] + ; GFX940_ITERATIVE-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_ITERATIVE-NEXT: [[COPY34:%[0-9]+]]:vreg_64_align2 = COPY [[DEF9]] + ; GFX940_ITERATIVE-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_ITERATIVE-NEXT: S_BRANCH %bb.2 + ; + ; GFX940_DPP-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw + ; GFX940_DPP: bb.0 (%ir-block.0): + ; GFX940_DPP-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; GFX940_DPP-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940_DPP-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940_DPP-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX940_DPP-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX940_DPP-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940_DPP-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX940_DPP-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX940_DPP-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]] + ; GFX940_DPP-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.1 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.1 (%ir-block.5): + ; GFX940_DPP-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940_DPP-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1 + ; GFX940_DPP-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0 + ; GFX940_DPP-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940_DPP-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX940_DPP-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY9]], [[COPY10]], implicit $exec + ; GFX940_DPP-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY8]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec + ; GFX940_DPP-NEXT: [[V_SET_INACTIVE_B64_:%[0-9]+]]:vreg_64_align2 = V_SET_INACTIVE_B64 [[COPY4]], [[V_MOV_B]], implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_SET_INACTIVE_B64_]], 273, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_SET_INACTIVE_B64_]], 0, killed [[V_MOV_B1]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 0, killed [[V_MOV_B2]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_1]], 0, killed [[V_MOV_B3]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_2]], 0, killed [[V_MOV_B4]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_3]], 0, killed [[V_MOV_B5]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_4]], 0, killed [[V_MOV_B6]], 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO [[V_MOV_B]], [[V_ADD_F64_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX940_DPP-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub1 + ; GFX940_DPP-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX940_DPP-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY11]], [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_5]].sub0 + ; GFX940_DPP-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[COPY12]], [[S_MOV_B32_1]] + ; GFX940_DPP-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_READLANE_B32_1]], %subreg.sub0, killed [[V_READLANE_B32_]], %subreg.sub1 + ; GFX940_DPP-NEXT: early-clobber %2:sreg_64 = STRICT_WWM killed [[REG_SEQUENCE2]], implicit $exec + ; GFX940_DPP-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY13:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] + ; GFX940_DPP-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[DEF3]] + ; GFX940_DPP-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.2 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.2 (%ir-block.32): + ; GFX940_DPP-NEXT: successors: %bb.4(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940_DPP-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY %2 + ; GFX940_DPP-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY15]], [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX940_DPP-NEXT: S_BRANCH %bb.4 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.3.Flow: + ; GFX940_DPP-NEXT: successors: %bb.5(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY6]], %bb.0, %8, %bb.4 + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: S_BRANCH %bb.5 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.4 (%ir-block.35): + ; GFX940_DPP-NEXT: successors: %bb.3(0x80000000) + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY14]], %bb.1, [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]], %bb.2 + ; GFX940_DPP-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940_DPP-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY16]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX940_DPP-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY17]], implicit $exec + ; GFX940_DPP-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_1]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1 + ; GFX940_DPP-NEXT: early-clobber %55:vreg_64_align2 = STRICT_WWM [[V_MOV_B7]], implicit $exec + ; GFX940_DPP-NEXT: [[V_ADD_F64_e64_6:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, killed [[REG_SEQUENCE3]], 0, killed %55, 0, 0, implicit $mode, implicit $exec + ; GFX940_DPP-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_6]].sub1 + ; GFX940_DPP-NEXT: [[COPY19:%[0-9]+]]:sreg_64_xexec = COPY [[COPY13]] + ; GFX940_DPP-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX940_DPP-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[COPY18]], 0, [[COPY20]], [[COPY19]], implicit $exec + ; GFX940_DPP-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_6]].sub0 + ; GFX940_DPP-NEXT: [[COPY22:%[0-9]+]]:sreg_64_xexec = COPY [[COPY13]] + ; GFX940_DPP-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX940_DPP-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[COPY21]], 0, [[COPY23]], [[COPY22]], implicit $exec + ; GFX940_DPP-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940_DPP-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_CNDMASK_B32_e64_1]], %subreg.sub0, [[V_CNDMASK_B32_e64_]], %subreg.sub1 + ; GFX940_DPP-NEXT: [[COPY24:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE4]] + ; GFX940_DPP-NEXT: S_BRANCH %bb.3 + ; GFX940_DPP-NEXT: {{ $}} + ; GFX940_DPP-NEXT: bb.5 (%ir-block.41): + ; GFX940_DPP-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GFX940_DPP-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GFX940_DPP-NEXT: $sgpr0 = COPY [[COPY25]] + ; GFX940_DPP-NEXT: $sgpr1 = COPY [[COPY26]] + ; GFX940_DPP-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic ret double %ret } diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 52fe2342d41a82..5c4ded9a231e0d 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -21,8 +21,9 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -191,8 +192,9 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -363,8 +365,9 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -545,8 +548,9 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -697,8 +701,9 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -852,8 +857,9 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1023,8 +1029,9 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1241,8 +1248,9 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1450,8 +1458,9 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1620,8 +1629,9 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1792,8 +1802,9 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1974,8 +1985,9 @@ define void @global_agent_atomic_fadd_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2126,8 +2138,9 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2281,8 +2294,9 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2452,8 +2466,9 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2670,8 +2685,9 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2887,8 +2903,9 @@ define double @global_agent_atomic_fadd_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3107,8 +3124,9 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3328,8 +3346,9 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3555,8 +3574,9 @@ define void @global_agent_atomic_fadd_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3757,8 +3777,9 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3962,8 +3983,9 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4195,8 +4217,9 @@ define half @global_agent_atomic_fadd_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4533,8 +4556,9 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4882,8 +4906,9 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5229,8 +5254,9 @@ define void @global_agent_atomic_fadd_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -5555,8 +5581,9 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -5891,8 +5918,9 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6218,8 +6246,9 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6476,8 +6505,9 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6740,8 +6770,9 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7089,8 +7120,9 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -7440,8 +7472,9 @@ define bfloat @global_agent_atomic_fadd_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7832,8 +7865,9 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8236,8 +8270,9 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8637,8 +8672,9 @@ define void @global_agent_atomic_fadd_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -9017,8 +9053,9 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9408,8 +9445,9 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9790,8 +9828,9 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10110,8 +10149,9 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -10436,8 +10476,9 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -10840,8 +10881,9 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11207,8 +11249,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -11437,8 +11480,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -11669,8 +11713,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -11905,8 +11950,9 @@ define void @global_agent_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -12113,8 +12159,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -12324,8 +12371,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -12543,8 +12591,9 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -12777,8 +12826,9 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -13007,8 +13057,9 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -13335,8 +13386,9 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -13665,8 +13717,9 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -13999,8 +14052,9 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -14317,8 +14371,9 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -14638,8 +14693,9 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -14967,8 +15023,9 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -15299,8 +15356,9 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -15635,7 +15693,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v1, s[2:3] +; GFX12-NEXT: global_atomic_add_f32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX12-NEXT: .LBB58_2: ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index ae5dca4aa86fb5..4f7b6164936f83 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -21,8 +21,9 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -182,8 +183,9 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -345,8 +347,9 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -508,8 +511,9 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -663,8 +667,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -821,8 +826,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -989,8 +995,9 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1240,8 +1247,9 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -1479,8 +1487,9 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1640,8 +1649,9 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1803,8 +1813,9 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1966,8 +1977,9 @@ define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2121,8 +2133,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2279,8 +2292,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2447,8 +2461,9 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2698,8 +2713,9 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -2947,8 +2963,9 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3118,8 +3135,9 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3290,8 +3308,9 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3461,8 +3480,9 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3622,8 +3642,9 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3786,8 +3807,9 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3969,8 +3991,9 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4321,8 +4344,9 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4684,8 +4708,9 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5046,8 +5071,9 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -5388,8 +5414,9 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -5740,8 +5767,9 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -6082,8 +6110,9 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6355,8 +6384,9 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -6634,8 +6664,9 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6998,8 +7029,9 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -7362,8 +7394,9 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7756,8 +7789,9 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8162,8 +8196,9 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8565,8 +8600,9 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8947,8 +8983,9 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9340,8 +9377,9 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9724,8 +9762,9 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10046,8 +10085,9 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -10374,8 +10414,9 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -10780,8 +10821,9 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11159,8 +11201,9 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11450,8 +11493,9 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11743,8 +11787,9 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -12039,8 +12084,9 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12319,8 +12365,9 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12602,8 +12649,9 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12894,8 +12942,9 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -13188,8 +13237,9 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -13496,8 +13546,9 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -13893,8 +13944,9 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -14292,8 +14344,9 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -14693,8 +14746,9 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -15077,8 +15131,9 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -15464,8 +15519,9 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -15861,8 +15917,9 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -16260,8 +16317,9 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 915ce7433f5b0c..591e01b11bd245 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -21,8 +21,9 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -182,8 +183,9 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -345,8 +347,9 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -508,8 +511,9 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -663,8 +667,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -821,8 +826,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -989,8 +995,9 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1240,8 +1247,9 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -1479,8 +1487,9 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1640,8 +1649,9 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1803,8 +1813,9 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1966,8 +1977,9 @@ define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2121,8 +2133,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2279,8 +2292,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2447,8 +2461,9 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 ; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2698,8 +2713,9 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -2947,8 +2963,9 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3118,8 +3135,9 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3290,8 +3308,9 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3461,8 +3480,9 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3622,8 +3642,9 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3786,8 +3807,9 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3969,8 +3991,9 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4321,8 +4344,9 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4684,8 +4708,9 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5046,8 +5071,9 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -5388,8 +5414,9 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -5740,8 +5767,9 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -6082,8 +6110,9 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6355,8 +6384,9 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -6634,8 +6664,9 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6998,8 +7029,9 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -7362,8 +7394,9 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7756,8 +7789,9 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8162,8 +8196,9 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8565,8 +8600,9 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8947,8 +8983,9 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9340,8 +9377,9 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -9724,8 +9762,9 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10046,8 +10085,9 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -10374,8 +10414,9 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -10780,8 +10821,9 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11159,8 +11201,9 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11450,8 +11493,9 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11743,8 +11787,9 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -12039,8 +12084,9 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12319,8 +12365,9 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12602,8 +12649,9 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12894,8 +12942,9 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -13188,8 +13237,9 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -13496,8 +13546,9 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -13893,8 +13944,9 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -14292,8 +14344,9 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -14693,8 +14746,9 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -15077,8 +15131,9 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -15464,8 +15519,9 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -15861,8 +15917,9 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -16260,8 +16317,9 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index e26619a39bcefc..8e58f309dd9ae9 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -29,8 +29,9 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -259,8 +260,9 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -491,8 +493,9 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -731,8 +734,9 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -950,8 +954,9 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1172,8 +1177,9 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1404,8 +1410,9 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1636,8 +1643,9 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -1866,8 +1874,9 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2096,8 +2105,9 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2328,8 +2338,9 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2568,8 +2579,9 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -2787,8 +2799,9 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -3009,8 +3022,9 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -3241,8 +3255,9 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -3473,8 +3488,9 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -3703,8 +3719,9 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -3953,8 +3970,9 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4204,8 +4222,9 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4461,8 +4480,9 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4689,8 +4709,9 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -4920,8 +4941,9 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] @@ -5179,8 +5201,9 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5517,8 +5540,9 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5866,8 +5890,9 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6213,8 +6238,9 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6539,8 +6565,9 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -6875,8 +6902,9 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -7202,8 +7230,9 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -7460,8 +7489,9 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -7724,8 +7754,9 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8073,8 +8104,9 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -8424,8 +8456,9 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -8816,8 +8849,9 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -9220,8 +9254,9 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -9621,8 +9656,9 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -10001,8 +10037,9 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -10392,8 +10429,9 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -10774,8 +10812,9 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -11094,8 +11133,9 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -11420,8 +11460,9 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -11824,8 +11865,9 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -12199,8 +12241,9 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -12473,8 +12516,9 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -12749,8 +12793,9 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -13027,8 +13072,9 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -13288,8 +13334,9 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -13552,8 +13599,9 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -13826,8 +13874,9 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -14102,8 +14151,9 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -14394,8 +14444,9 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -14791,8 +14842,9 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -15190,8 +15242,9 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -15591,8 +15644,9 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -15975,8 +16029,9 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -16362,8 +16417,9 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -16759,8 +16815,9 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 @@ -17158,8 +17215,9 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 2f7e91faa41847..157f91ccc6b1c5 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -3697,7 +3697,7 @@ define amdgpu_ps float @atomic_global_load_saddr_i32(ptr addrspace(1) inreg %sba ; ; GFX12-LABEL: atomic_global_load_saddr_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] th:TH_LOAD_NT +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: ; return to shader part epilog @@ -3734,7 +3734,7 @@ define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(ptr addrspace(1) ; ; GFX12-LABEL: atomic_global_load_saddr_i32_immneg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 th:TH_LOAD_NT +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: ; return to shader part epilog @@ -3772,7 +3772,7 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(ptr addrspace(1) inre ; ; GFX12-LABEL: atomic_global_load_saddr_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] th:TH_LOAD_NT +; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: ; return to shader part epilog @@ -3809,7 +3809,7 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(ptr addrspa ; ; GFX12-LABEL: atomic_global_load_saddr_i64_immneg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 th:TH_LOAD_NT +; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll index 1102f9d0f1a5fd..790056b320d8cb 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll @@ -1466,7 +1466,8 @@ define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr(ptr addrspace(1) ; ; GFX12-LABEL: atomic_global_store_saddr_i32_zext_vgpr: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1491,7 +1492,8 @@ define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr_offset_neg128(ptr ; ; GFX12-LABEL: atomic_global_store_saddr_i32_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SYS ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1517,7 +1519,8 @@ define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr(ptr addrspace(1) ; ; GFX12-LABEL: atomic_global_store_saddr_i64_zext_vgpr: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1542,7 +1545,8 @@ define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr_offset_neg128(ptr ; ; GFX12-LABEL: atomic_global_store_saddr_i64_zext_vgpr_offset_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SYS ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll index dab5e991d7d43f..345b1b601d6a80 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll @@ -1013,9 +1013,89 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_agent_s } define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { -; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-NEXT: ret double [[RESULT]] +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = fadd double [[TMP14]], [[TMP22:%.*]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = select i1 [[TMP28:%.*]], double [[TMP14]], double [[TMP15]] +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP16]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP18]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP23]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP26:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP20]]) +; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP20]], double [[OLDVALUEPHI]]) +; IR-ITERATIVE-NEXT: [[TMP23]] = fadd double [[ACCUMULATOR]], [[TMP21]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = shl i64 1, [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = xor i64 [[TMP24]], -1 +; IR-ITERATIVE-NEXT: [[TMP26]] = and i64 [[ACTIVEBITS]], [[TMP25]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i64 [[TMP26]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP28]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP28]], label [[TMP10]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP34:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP11:%.*]] = fadd double [[TMP9]], [[TMP10]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = fadd double [[TMP11]], [[TMP12]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = fadd double [[TMP13]], [[TMP14]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP17:%.*]] = fadd double [[TMP15]], [[TMP16]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP19:%.*]] = fadd double [[TMP17]], [[TMP18]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP21:%.*]] = fadd double [[TMP19]], [[TMP20]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) +; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) +; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) +; IR-DPP-NEXT: [[TMP32:%.*]] = fadd double [[TMP30]], [[TMP31]] +; IR-DPP-NEXT: [[TMP33:%.*]] = select i1 [[TMP25]], double [[TMP30]], double [[TMP32]] +; IR-DPP-NEXT: br label [[TMP34]] +; IR-DPP: 34: +; IR-DPP-NEXT: [[TMP35:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP33]], [[TMP28]] ] +; IR-DPP-NEXT: ret double [[TMP35]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 ret double %result @@ -1089,9 +1169,89 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_ } define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1 { -; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8 -; IR-NEXT: ret double [[RESULT]] +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23:%.*]] syncscope("one-as") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP14]], double [[TMP22:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = select i1 [[TMP28:%.*]], double [[TMP14]], double [[TMP15]] +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP16]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP18]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP23]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP26:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP20]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP20]], double [[OLDVALUEPHI]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = shl i64 1, [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = xor i64 [[TMP24]], -1 +; IR-ITERATIVE-NEXT: [[TMP26]] = and i64 [[ACTIVEBITS]], [[TMP25]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i64 [[TMP26]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP28]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP28]], label [[TMP10]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP34:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("one-as") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP30]], double [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP33:%.*]] = select i1 [[TMP25]], double [[TMP30]], double [[TMP32]] +; IR-DPP-NEXT: br label [[TMP34]] +; IR-DPP: 34: +; IR-DPP-NEXT: [[TMP35:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP33]], [[TMP28]] ] +; IR-DPP-NEXT: ret double [[TMP35]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic ret double %result @@ -1165,9 +1325,89 @@ define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_s } define amdgpu_ps double @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { -; IR-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 -; IR-NEXT: ret double [[RESULT]] +; IR-ITERATIVE-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP23:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP14]], double [[TMP22:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = select i1 [[TMP28:%.*]], double [[TMP14]], double [[TMP15]] +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP16]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP18]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP23]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP26:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP20]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP20]], double [[OLDVALUEPHI]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = shl i64 1, [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = xor i64 [[TMP24]], -1 +; IR-ITERATIVE-NEXT: [[TMP26]] = and i64 [[ACTIVEBITS]], [[TMP25]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i64 [[TMP26]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP28]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP28]], label [[TMP10]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP34:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP30]], double [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP33:%.*]] = select i1 [[TMP25]], double [[TMP30]], double [[TMP32]] +; IR-DPP-NEXT: br label [[TMP34]] +; IR-DPP: 34: +; IR-DPP-NEXT: [[TMP35:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP33]], [[TMP28]] ] +; IR-DPP-NEXT: ret double [[TMP35]] ; %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret double %result @@ -1206,9 +1446,89 @@ define amdgpu_ps double @global_atomic_fmin_double_uni_address_uni_value_agent_s } define amdgpu_ps double @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { -; IR-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 -; IR-NEXT: ret double [[RESULT]] +; IR-ITERATIVE-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP23:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.minnum.f64(double [[TMP14]], double [[TMP22:%.*]]) +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = select i1 [[TMP28:%.*]], double [[TMP14]], double [[TMP15]] +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP16]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP18]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0x7FF8000000000000, [[TMP2]] ], [ [[TMP23]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP26:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP20]]) +; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP20]], double [[OLDVALUEPHI]]) +; IR-ITERATIVE-NEXT: [[TMP23]] = call double @llvm.minnum.f64(double [[ACCUMULATOR]], double [[TMP21]]) +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = shl i64 1, [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = xor i64 [[TMP24]], -1 +; IR-ITERATIVE-NEXT: [[TMP26]] = and i64 [[ACTIVEBITS]], [[TMP25]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i64 [[TMP26]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP28]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP28]], label [[TMP10]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP34:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0x7FF8000000000000) +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.minnum.f64(double [[TMP9]], double [[TMP10]]) +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.minnum.f64(double [[TMP11]], double [[TMP12]]) +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.minnum.f64(double [[TMP13]], double [[TMP14]]) +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.minnum.f64(double [[TMP15]], double [[TMP16]]) +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.minnum.f64(double [[TMP17]], double [[TMP18]]) +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.minnum.f64(double [[TMP19]], double [[TMP20]]) +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) +; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) +; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) +; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.minnum.f64(double [[TMP30]], double [[TMP31]]) +; IR-DPP-NEXT: [[TMP33:%.*]] = select i1 [[TMP25]], double [[TMP30]], double [[TMP32]] +; IR-DPP-NEXT: br label [[TMP34]] +; IR-DPP: 34: +; IR-DPP-NEXT: [[TMP35:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP33]], [[TMP28]] ] +; IR-DPP-NEXT: ret double [[TMP35]] ; %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret double %result @@ -1274,9 +1594,89 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_ } define amdgpu_ps double @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1{ -; IR-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 -; IR-NEXT: ret double [[RESULT]] +; IR-ITERATIVE-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP23:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP14]], double [[TMP22:%.*]], metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = select i1 [[TMP28:%.*]], double [[TMP14]], double [[TMP15]] +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP16]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP18]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0x7FF8000000000000, [[TMP2]] ], [ [[TMP23]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP26:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP20]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP20]], double [[OLDVALUEPHI]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23]] = call double @llvm.experimental.constrained.maxnum.f64(double [[ACCUMULATOR]], double [[TMP21]], metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = shl i64 1, [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = xor i64 [[TMP24]], -1 +; IR-ITERATIVE-NEXT: [[TMP26]] = and i64 [[ACTIVEBITS]], [[TMP25]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i64 [[TMP26]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP28]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP28]], label [[TMP10]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP34:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0x7FF8000000000000) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP9]], double [[TMP10]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP11]], double [[TMP12]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP13]], double [[TMP14]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP15]], double [[TMP16]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP17]], double [[TMP18]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP19]], double [[TMP20]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP30]], double [[TMP31]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP33:%.*]] = select i1 [[TMP25]], double [[TMP30]], double [[TMP32]] +; IR-DPP-NEXT: br label [[TMP34]] +; IR-DPP: 34: +; IR-DPP-NEXT: [[TMP35:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP33]], [[TMP28]] ] +; IR-DPP-NEXT: ret double [[TMP35]] ; %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret double %result @@ -1350,9 +1750,89 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_ } define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { -; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4 -; IR-NEXT: ret double [[RESULT]] +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP13]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP14]], double [[TMP22:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = select i1 [[TMP28:%.*]], double [[TMP14]], double [[TMP15]] +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP16]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP18]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP23]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP22]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP26:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP20]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP22]] = call double @llvm.amdgcn.writelane.f64(double [[ACCUMULATOR]], i32 [[TMP20]], double [[OLDVALUEPHI]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = shl i64 1, [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = xor i64 [[TMP24]], -1 +; IR-ITERATIVE-NEXT: [[TMP26]] = and i64 [[ACTIVEBITS]], [[TMP25]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = icmp eq i64 [[TMP26]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP27]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP28]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP28]], label [[TMP10]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP34:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP21]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP23]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP25]], label [[TMP26:%.*]], label [[TMP28:%.*]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP24]] monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: [[TMP29:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP27]], [[TMP26]] ] +; IR-DPP-NEXT: [[TMP30:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[TMP29]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP31:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP30]], double [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP33:%.*]] = select i1 [[TMP25]], double [[TMP30]], double [[TMP32]] +; IR-DPP-NEXT: br label [[TMP34]] +; IR-DPP: 34: +; IR-DPP-NEXT: [[TMP35:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP33]], [[TMP28]] ] +; IR-DPP-NEXT: ret double [[TMP35]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 ret double %result diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll index 3bf52a56fef5b5..40f0acf3d5d09f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -49,7 +49,8 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -120,7 +121,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -198,7 +200,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -279,7 +282,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -343,7 +347,8 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -413,7 +418,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -488,7 +494,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -566,7 +573,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -625,7 +633,8 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -696,7 +705,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -774,7 +784,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -855,7 +866,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -919,7 +931,8 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -989,7 +1002,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -1064,7 +1078,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -1142,7 +1157,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -1201,7 +1217,8 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -1272,7 +1289,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -1350,7 +1368,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -1431,7 +1450,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -1495,7 +1515,8 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -1565,7 +1586,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -1640,7 +1662,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -1718,7 +1741,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -1771,7 +1795,8 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -1839,7 +1864,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -1911,7 +1937,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -1989,7 +2016,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -2047,7 +2075,8 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2114,7 +2143,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -2183,7 +2213,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2258,7 +2289,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -2311,7 +2343,8 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2379,7 +2412,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -2451,7 +2485,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2529,7 +2564,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -2587,7 +2623,8 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2654,7 +2691,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -2723,7 +2761,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2798,7 +2837,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -2851,7 +2891,8 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -2919,7 +2960,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -2991,7 +3033,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -3069,7 +3112,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -3127,7 +3171,8 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -3194,7 +3239,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -3263,7 +3309,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -3338,7 +3385,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -3391,7 +3439,8 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -3459,7 +3508,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -3531,7 +3581,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -3609,7 +3660,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -3667,7 +3719,8 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -3734,7 +3787,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -3803,7 +3857,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm @@ -3878,7 +3933,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -3937,7 +3993,8 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4008,7 +4065,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -4086,7 +4144,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6 ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4167,7 +4226,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -4231,7 +4291,8 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4301,7 +4362,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -4376,7 +4438,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4454,7 +4517,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -4513,7 +4577,8 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4568,7 +4633,8 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4623,7 +4689,8 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4694,7 +4761,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -4772,7 +4840,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4853,7 +4922,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -4917,7 +4987,8 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -4987,7 +5058,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -5062,7 +5134,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5140,7 +5213,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -5199,7 +5273,8 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5270,7 +5345,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -5348,7 +5424,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5429,7 +5506,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -5493,7 +5571,8 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5563,7 +5642,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -5638,7 +5718,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5716,7 +5797,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -5791,7 +5873,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5864,7 +5947,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:72000 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:72000 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5937,7 +6021,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[2:3] @@ -6019,7 +6104,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] -; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6112,7 +6198,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[6:7] @@ -6189,7 +6276,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i6 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6261,7 +6349,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[2:3] @@ -6340,7 +6429,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] -; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6430,7 +6520,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[6:7] @@ -6496,7 +6587,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -6563,7 +6654,7 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:-32 th:TH_LOAD_NT +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:-32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -6626,7 +6717,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -6707,7 +6798,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -6788,7 +6879,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -6870,7 +6961,7 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -6928,7 +7019,8 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -6979,7 +7071,8 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7047,7 +7140,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7115,7 +7209,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7184,7 +7279,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7240,7 +7336,8 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -7311,7 +7408,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -7389,7 +7487,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -7445,7 +7544,8 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -7516,7 +7616,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -7594,7 +7695,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i ; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] -; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll index dfc831cb5050a2..c89be8063d9a8c 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll @@ -852,9 +852,75 @@ define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_agent_sco } define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { -; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-NEXT: ret void +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) +; IR-ITERATIVE-NEXT: [[TMP17]] = fadd double [[ACCUMULATOR]], [[TMP16]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP11:%.*]] = fadd double [[TMP9]], [[TMP10]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = fadd double [[TMP11]], [[TMP12]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = fadd double [[TMP13]], [[TMP14]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP17:%.*]] = fadd double [[TMP15]], [[TMP16]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP19:%.*]] = fadd double [[TMP17]], [[TMP18]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP21:%.*]] = fadd double [[TMP19]], [[TMP20]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 ret void @@ -914,9 +980,75 @@ define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_one_as_sc } define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1 { -; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8 -; IR-NEXT: ret void +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("one-as") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("one-as") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic ret void @@ -976,9 +1108,75 @@ define amdgpu_ps void @global_atomic_fsub_double_uni_address_uni_value_agent_sco } define amdgpu_ps void @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { -; IR-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 -; IR-NEXT: ret void +; IR-ITERATIVE-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: ret void ; %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret void @@ -1010,9 +1208,75 @@ define amdgpu_ps void @global_atomic_fmin_double_uni_address_uni_value_agent_sco } define amdgpu_ps void @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { -; IR-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 -; IR-NEXT: ret void +; IR-ITERATIVE-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0x7FF8000000000000, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) +; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.minnum.f64(double [[ACCUMULATOR]], double [[TMP16]]) +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0x7FF8000000000000) +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.minnum.f64(double [[TMP9]], double [[TMP10]]) +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.minnum.f64(double [[TMP11]], double [[TMP12]]) +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.minnum.f64(double [[TMP13]], double [[TMP14]]) +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.minnum.f64(double [[TMP15]], double [[TMP16]]) +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.minnum.f64(double [[TMP17]], double [[TMP18]]) +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.minnum.f64(double [[TMP19]], double [[TMP20]]) +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: ret void ; %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret void @@ -1064,9 +1328,75 @@ define amdgpu_ps void @global_atomic_fmax_double_uni_address_uni_value_agent_sco } define amdgpu_ps void @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp(ptr addrspace(1) inreg %ptr, double %val) #1{ -; IR-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 -; IR-NEXT: ret void +; IR-ITERATIVE-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ 0x7FF8000000000000, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.maxnum.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double 0x7FF8000000000000) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP9]], double [[TMP10]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP11]], double [[TMP12]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP13]], double [[TMP14]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP15]], double [[TMP16]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP17]], double [[TMP18]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double 0x7FF8000000000000, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP19]], double [[TMP20]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: ret void ; %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret void @@ -1126,9 +1456,75 @@ define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_system_sc } define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { -; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( -; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4 -; IR-NEXT: ret void +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP17:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi double [ -0.000000e+00, [[TMP2]] ], [ [[TMP17]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP20:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[VAL:%.*]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17]] = call double @llvm.experimental.constrained.fadd.f64(double [[ACCUMULATOR]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], -1 +; IR-ITERATIVE-NEXT: [[TMP20]] = and i64 [[ACTIVEBITS]], [[TMP19]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP22]], label [[TMP10:%.*]], label [[TMP12]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP28:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call double @llvm.amdgcn.set.inactive.f64(double [[VAL:%.*]], double -0.000000e+00) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP9]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP9]], double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP11]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP11]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP14:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP13]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP13]], double [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP16:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP15]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP15]], double [[TMP16]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP17]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP17]], double [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = call double @llvm.amdgcn.update.dpp.f64(double -0.000000e+00, double [[TMP19]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP21:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP19]], double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[TMP21]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.amdgcn.strict.wwm.f64(double [[TMP22]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP27:%.*]] +; IR-DPP: 25: +; IR-DPP-NEXT: [[TMP26:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP23]] monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP27]] +; IR-DPP: 27: +; IR-DPP-NEXT: br label [[TMP28]] +; IR-DPP: 28: +; IR-DPP-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index ab32efc4d3cd8e..c05f9c679979da 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -5,7 +5,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s -; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s @@ -191,6 +191,42 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-DPP-NEXT: .LBB0_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec @@ -337,19 +373,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -361,27 +395,51 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB1_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB1_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4 +; GFX7LESS-NEXT: .LBB1_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: @@ -676,6 +734,56 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB1_4: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -1305,6 +1413,52 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-DPP-NEXT: .LBB2_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -1517,19 +1671,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -1541,27 +1693,51 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB3_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB3_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4 +; GFX7LESS-NEXT: .LBB3_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -1882,6 +2058,56 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-NEXT: .LBB3_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -2537,6 +2763,52 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-DPP-NEXT: .LBB4_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -2749,19 +3021,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -2773,27 +3043,51 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB5_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4 +; GFX7LESS-NEXT: .LBB5_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: @@ -3088,6 +3382,56 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB5_4: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -3471,19 +3815,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -3495,27 +3837,51 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB6_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB6_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB6_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB6_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_4 +; GFX7LESS-NEXT: .LBB6_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: @@ -3810,6 +4176,56 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB6_4: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -4439,6 +4855,52 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-NEXT: .LBB7_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX7LESS-DPP-NEXT: .LBB7_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -4650,19 +5112,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -4674,27 +5134,51 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB8_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB8_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB8_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB8_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_4 +; GFX7LESS-NEXT: .LBB8_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp: @@ -5015,6 +5499,56 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-NEXT: .LBB8_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -5869,6 +6403,86 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v4, v0, v4 +; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], v[1:2], 4.0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v4, v3 +; GFX7LESS-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX7LESS-DPP-NEXT: .LBB9_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -6257,9 +6871,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -6269,25 +6880,49 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-NEXT: s_mov_b32 s12, s41 ; GFX7LESS-NEXT: s_mov_b32 s13, s40 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX7LESS-NEXT: .LBB10_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB10_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB10_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 @@ -6310,7 +6945,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s12, s41 ; GFX7LESS-NEXT: s_mov_b32 s13, s40 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -6324,8 +6959,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_4 +; GFX7LESS-NEXT: .LBB10_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6347,31 +6982,51 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s41 ; GFX9-NEXT: s_mov_b32 s13, s40 ; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX9-NEXT: .LBB10_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB10_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX9-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -6388,7 +7043,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s12, s41 ; GFX9-NEXT: s_mov_b32 s13, s40 ; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -6406,8 +7061,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB10_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB10_4 +; GFX9-NEXT: .LBB10_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6429,31 +7084,51 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s40, s7 ; GFX1064-NEXT: s_mov_b32 s41, s6 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-NEXT: s_mov_b32 s12, s41 ; GFX1064-NEXT: s_mov_b32 s13, s40 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB10_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] @@ -6462,7 +7137,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42 @@ -6489,8 +7164,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1064-NEXT: .LBB10_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6512,31 +7187,50 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s40, s7 ; GFX1032-NEXT: s_mov_b32 s41, s6 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-NEXT: s_mov_b32 s12, s41 ; GFX1032-NEXT: s_mov_b32 s13, s40 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB10_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] @@ -6545,7 +7239,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 @@ -6572,8 +7266,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1032-NEXT: .LBB10_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6586,9 +7280,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -6596,29 +7289,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB10_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 -; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 @@ -6644,8 +7361,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1164-NEXT: .LBB10_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; @@ -6660,7 +7377,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -6671,24 +7387,49 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: v_mov_b32_e32 v41, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB10_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 -; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 @@ -6710,11 +7451,97 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1132-NEXT: .LBB10_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -6734,31 +7561,86 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], s[42:43] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] @@ -6775,12 +7657,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -6791,10 +7673,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6816,31 +7698,78 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v9, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v8, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] @@ -6849,7 +7778,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 @@ -6876,8 +7805,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6899,31 +7828,72 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] @@ -6932,7 +7902,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 @@ -6959,8 +7929,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6973,9 +7943,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -6983,29 +7952,87 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v10, v8 +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 -; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 @@ -7031,8 +8058,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; @@ -7047,7 +8074,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -7058,24 +8084,74 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 -; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -7097,8 +8173,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() @@ -7364,6 +8440,55 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-NEXT: .LBB11_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX7LESS-DPP-NEXT: .LBB11_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -7579,19 +8704,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -7603,30 +8726,56 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX7LESS-NEXT: .LBB12_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB12_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB12_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_4 +; GFX7LESS-NEXT: .LBB12_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -7637,10 +8786,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -7655,24 +8804,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB12_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB12_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB12_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB12_4 +; GFX9-NEXT: .LBB12_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -7682,43 +8854,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB12_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1064-NEXT: .LBB12_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -7728,117 +8923,245 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB12_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1032-NEXT: .LBB12_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB12_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1164-NEXT: .LBB12_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB12_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1132-NEXT: .LBB12_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -7847,10 +9170,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -7865,26 +9188,85 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-DPP-NEXT: s_endpgm -; +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-DPP-NEXT: .LBB12_3: +; GFX9-DPP-NEXT: s_endpgm +; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -7892,43 +9274,93 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-DPP-NEXT: .LBB12_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -7938,115 +9370,271 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-DPP-NEXT: .LBB12_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-DPP-NEXT: .LBB12_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-DPP-NEXT: .LBB12_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() strictfp %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic @@ -8311,6 +9899,55 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-DPP-NEXT: .LBB13_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -8526,19 +10163,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -8550,30 +10185,56 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX7LESS-NEXT: .LBB14_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB14_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB14_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_4 +; GFX7LESS-NEXT: .LBB14_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: @@ -8584,10 +10245,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -8602,24 +10263,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB14_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB14_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB14_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB14_4 +; GFX9-NEXT: .LBB14_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: @@ -8629,43 +10313,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB14_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1064-NEXT: .LBB14_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: @@ -8675,117 +10382,245 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB14_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: -; GFX1164: ; %bb.0: +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1032-NEXT: .LBB14_5: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB14_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1164-NEXT: .LBB14_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB14_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1132-NEXT: .LBB14_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -8794,10 +10629,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -8812,24 +10647,83 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX9-DPP-NEXT: .LBB14_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: @@ -8839,43 +10733,93 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1064-DPP-NEXT: .LBB14_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: @@ -8885,115 +10829,271 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1032-DPP-NEXT: .LBB14_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1164-DPP-NEXT: .LBB14_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1132-DPP-NEXT: .LBB14_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic @@ -9004,19 +11104,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -9028,30 +11126,56 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX7LESS-NEXT: .LBB15_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB15_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB15_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_4 +; GFX7LESS-NEXT: .LBB15_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: @@ -9062,10 +11186,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -9080,70 +11204,116 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB15_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB15_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB15_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB15_4: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_5: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB15_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1064-NEXT: .LBB15_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: @@ -9153,117 +11323,245 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB15_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1032-NEXT: .LBB15_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB15_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1164-NEXT: .LBB15_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB15_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1132-NEXT: .LBB15_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -9272,10 +11570,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -9290,24 +11588,83 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX9-DPP-NEXT: .LBB15_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: @@ -9317,43 +11674,93 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1064-DPP-NEXT: .LBB15_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: @@ -9363,115 +11770,271 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1032-DPP-NEXT: .LBB15_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1164-DPP-NEXT: .LBB15_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1132-DPP-NEXT: .LBB15_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic @@ -9944,6 +12507,88 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX7LESS-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v5, exec_hi, v5 +; GFX7LESS-DPP-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] +; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v3, v2 +; GFX7LESS-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX7LESS-DPP-NEXT: .LBB16_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -10346,9 +12991,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -10358,25 +13000,49 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-NEXT: s_mov_b32 s12, s41 ; GFX7LESS-NEXT: s_mov_b32 s13, s40 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX7LESS-NEXT: .LBB17_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB17_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB17_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 @@ -10399,7 +13065,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b32 s12, s41 ; GFX7LESS-NEXT: s_mov_b32 s13, s40 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -10413,8 +13079,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_4 +; GFX7LESS-NEXT: .LBB17_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: @@ -10436,31 +13102,51 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s41 ; GFX9-NEXT: s_mov_b32 s13, s40 ; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX9-NEXT: .LBB17_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB17_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX9-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -10477,7 +13163,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s12, s41 ; GFX9-NEXT: s_mov_b32 s13, s40 ; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -10495,8 +13181,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB17_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB17_4 +; GFX9-NEXT: .LBB17_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: @@ -10518,31 +13204,51 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s40, s7 ; GFX1064-NEXT: s_mov_b32 s41, s6 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-NEXT: s_mov_b32 s12, s41 ; GFX1064-NEXT: s_mov_b32 s13, s40 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB17_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] @@ -10551,7 +13257,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42 @@ -10578,8 +13284,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1064-NEXT: .LBB17_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: @@ -10601,31 +13307,50 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s40, s7 ; GFX1032-NEXT: s_mov_b32 s41, s6 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-NEXT: s_mov_b32 s12, s41 ; GFX1032-NEXT: s_mov_b32 s13, s40 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB17_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] @@ -10634,7 +13359,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 @@ -10661,8 +13386,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1032-NEXT: .LBB17_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: @@ -10675,9 +13400,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -10685,29 +13409,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB17_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 -; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 @@ -10733,8 +13481,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1164-NEXT: .LBB17_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; @@ -10749,7 +13497,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -10760,24 +13507,49 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: v_mov_b32_e32 v41, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB17_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 -; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 @@ -10799,11 +13571,97 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1132-NEXT: .LBB17_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -10823,31 +13681,86 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], s[42:43] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] @@ -10864,12 +13777,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -10880,10 +13793,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX9-DPP-NEXT: .LBB17_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: @@ -10905,31 +13818,78 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v9, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v8, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] @@ -10938,7 +13898,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 @@ -10965,8 +13925,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1064-DPP-NEXT: .LBB17_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: @@ -10988,31 +13948,72 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] @@ -11021,7 +14022,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 @@ -11048,8 +14049,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1032-DPP-NEXT: .LBB17_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: @@ -11062,9 +14063,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -11072,29 +14072,87 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v10, v8 +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 -; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 @@ -11120,8 +14178,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1164-DPP-NEXT: .LBB17_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; @@ -11136,7 +14194,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -11147,24 +14204,74 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 -; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -11186,8 +14293,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1132-DPP-NEXT: .LBB17_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp @@ -11397,6 +14504,42 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-NEXT: .LBB18_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX7LESS-DPP-NEXT: .LBB18_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec @@ -11767,6 +14910,42 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-NEXT: .LBB19_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX7LESS-DPP-NEXT: .LBB19_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index a13e704a1a5fc8..46f0bb03938857 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -5,7 +5,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s -; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s @@ -145,6 +145,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-DPP-NEXT: .LBB0_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -248,19 +281,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -272,14 +303,40 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB1_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB1_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -287,14 +344,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4 +; GFX7LESS-NEXT: .LBB1_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: @@ -577,6 +634,58 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB1_4: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -1153,6 +1262,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-DPP-NEXT: .LBB2_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1310,19 +1452,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -1334,14 +1474,40 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB3_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB3_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -1349,14 +1515,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4 +; GFX7LESS-NEXT: .LBB3_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: @@ -1698,6 +1864,58 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-NEXT: .LBB3_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -2335,6 +2553,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-DPP-NEXT: .LBB4_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2491,19 +2742,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -2515,14 +2764,40 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB5_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -2530,14 +2805,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4 +; GFX7LESS-NEXT: .LBB5_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: @@ -2879,6 +3154,58 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-NEXT: .LBB5_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -3763,6 +4090,83 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v3, v2 +; GFX7LESS-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX7LESS-DPP-NEXT: .LBB6_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -4135,9 +4539,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -4158,17 +4559,46 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB7_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB7_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[41:42] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -4195,14 +4625,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_4 +; GFX7LESS-NEXT: .LBB7_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: @@ -4224,7 +4654,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] @@ -4238,16 +4667,41 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX9-NEXT: .LBB7_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB7_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -4255,8 +4709,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] @@ -4277,14 +4731,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB7_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: @@ -4306,7 +4760,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s40, s7 ; GFX1064-NEXT: s_mov_b32 s41, s6 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -4320,28 +4773,53 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_max_f64 v[3:4], v[2:3], v[4:5] +; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB7_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 @@ -4352,22 +4830,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s13, s40 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1064-NEXT: .LBB7_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: @@ -4389,7 +4867,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s40, s7 ; GFX1032-NEXT: s_mov_b32 s41, s6 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -4403,23 +4880,47 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB7_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 @@ -4443,14 +4944,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1032-NEXT: .LBB7_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: @@ -4463,9 +4964,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -4476,18 +4976,47 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB7_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[42:43] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 -; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -4495,8 +5024,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] @@ -4506,22 +5035,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1164-NEXT: .LBB7_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; @@ -4536,7 +5065,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -4547,18 +5075,48 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB7_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[42:43] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 -; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -4566,33 +5124,120 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, s43 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s41 ; GFX1132-NEXT: s_mov_b32 s13, s40 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s42 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: v_mov_b32_e32 v5, 8 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1132-NEXT: .LBB7_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -4612,7 +5257,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] @@ -4626,16 +5270,81 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[42:43], s[42:43] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] @@ -4643,36 +5352,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: @@ -4694,7 +5403,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -4708,30 +5416,86 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v9, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v8, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], s[4:5], s[4:5] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], s[2:3], s[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] @@ -4740,22 +5504,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1064-DPP-NEXT: .LBB7_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: @@ -4777,7 +5543,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -4791,28 +5556,78 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -4823,22 +5638,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: @@ -4851,9 +5666,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -4864,18 +5678,89 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 +; GFX1164-DPP-NEXT: v_permlane64_b32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 -; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -4883,7 +5768,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -4892,24 +5777,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1164-DPP-NEXT: .LBB7_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; @@ -4924,7 +5809,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -4935,24 +5819,83 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 -; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -4961,23 +5904,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() @@ -5180,6 +6123,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-NEXT: .LBB8_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX7LESS-DPP-NEXT: .LBB8_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5344,19 +6324,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -5368,14 +6346,42 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7LESS-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX7LESS-NEXT: .LBB9_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB9_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB9_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5385,15 +6391,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_4 +; GFX7LESS-NEXT: .LBB9_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: @@ -5404,10 +6410,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -5422,26 +6428,51 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX9-NEXT: .LBB9_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB9_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB9_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB9_4 +; GFX9-NEXT: .LBB9_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: @@ -5451,45 +6482,70 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB9_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1064-NEXT: .LBB9_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: @@ -5499,125 +6555,263 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB9_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1032-NEXT: .LBB9_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB9_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1164-NEXT: .LBB9_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB9_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1132-NEXT: .LBB9_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -5626,10 +6820,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -5644,26 +6838,92 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] +; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[1:2] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: @@ -5673,45 +6933,103 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[0:1], v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-DPP-NEXT: v_max_f64 v[13:14], v[11:12], v[11:12] +; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[13:14], v[9:10] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: @@ -5721,123 +7039,303 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] +; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[9:10], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic @@ -6275,6 +7773,83 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v3, v2 +; GFX7LESS-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX7LESS-DPP-NEXT: .LBB10_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -6647,9 +8222,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -6670,17 +8242,46 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB11_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB11_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[41:42] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -6707,14 +8308,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_4 +; GFX7LESS-NEXT: .LBB11_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: @@ -6736,7 +8337,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] @@ -6750,16 +8350,41 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX9-NEXT: .LBB11_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB11_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -6767,8 +8392,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] @@ -6789,14 +8414,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB11_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: @@ -6818,7 +8443,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s40, s7 ; GFX1064-NEXT: s_mov_b32 s41, s6 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -6832,28 +8456,53 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_max_f64 v[3:4], v[2:3], v[4:5] +; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB11_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 @@ -6864,22 +8513,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s13, s40 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1064-NEXT: .LBB11_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: @@ -6901,7 +8550,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s40, s7 ; GFX1032-NEXT: s_mov_b32 s41, s6 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -6915,23 +8563,47 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB11_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 @@ -6955,14 +8627,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1032-NEXT: .LBB11_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: @@ -6975,9 +8647,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -6988,18 +8659,47 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB11_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[42:43] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 -; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -7007,8 +8707,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] @@ -7018,22 +8718,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1164-NEXT: .LBB11_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; @@ -7048,7 +8748,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -7059,18 +8758,48 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB11_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[42:43] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 -; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -7078,33 +8807,120 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, s43 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s41 ; GFX1132-NEXT: s_mov_b32 s13, s40 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s42 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: v_mov_b32_e32 v5, 8 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1132-NEXT: .LBB11_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -7124,7 +8940,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] @@ -7138,16 +8953,81 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[42:43], s[42:43] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] @@ -7155,36 +9035,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: @@ -7206,7 +9086,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -7220,30 +9099,86 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v9, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v8, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], s[4:5], s[4:5] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], s[2:3], s[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] @@ -7252,22 +9187,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: @@ -7289,7 +9226,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -7303,28 +9239,78 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -7335,22 +9321,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: @@ -7363,9 +9349,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -7376,18 +9361,89 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 +; GFX1164-DPP-NEXT: v_permlane64_b32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 -; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -7395,7 +9451,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -7404,24 +9460,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; @@ -7436,7 +9492,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -7447,24 +9502,83 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 -; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -7473,23 +9587,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() @@ -7680,6 +9794,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-NEXT: .LBB12_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX7LESS-DPP-NEXT: .LBB12_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8015,6 +10162,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-DPP-NEXT: .LBB13_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 65d0b9eafdf820..bd5e589ec2be7f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -5,7 +5,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s -; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s @@ -145,6 +145,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-DPP-NEXT: .LBB0_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -248,19 +281,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -272,14 +303,40 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB1_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB1_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -287,14 +344,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4 +; GFX7LESS-NEXT: .LBB1_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: @@ -577,6 +634,58 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB1_4: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -1153,6 +1262,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-DPP-NEXT: .LBB2_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1310,19 +1452,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -1334,14 +1474,40 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB3_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB3_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -1349,14 +1515,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4 +; GFX7LESS-NEXT: .LBB3_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: @@ -1698,6 +1864,58 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-NEXT: .LBB3_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -2335,6 +2553,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-DPP-NEXT: .LBB4_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2491,19 +2742,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -2515,14 +2764,40 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB5_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 @@ -2530,14 +2805,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4 +; GFX7LESS-NEXT: .LBB5_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: @@ -2879,6 +3154,58 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-NEXT: .LBB5_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -3763,6 +4090,83 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v3, v2 +; GFX7LESS-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX7LESS-DPP-NEXT: .LBB6_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -4135,9 +4539,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -4158,17 +4559,46 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB7_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB7_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], v[41:42] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -4195,14 +4625,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_4 +; GFX7LESS-NEXT: .LBB7_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: @@ -4224,7 +4654,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] @@ -4238,16 +4667,41 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX9-NEXT: .LBB7_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB7_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -4255,8 +4709,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] @@ -4277,14 +4731,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB7_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: @@ -4306,7 +4760,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s40, s7 ; GFX1064-NEXT: s_mov_b32 s41, s6 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -4320,28 +4773,53 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_min_f64 v[3:4], v[2:3], v[4:5] +; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB7_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 @@ -4352,22 +4830,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s13, s40 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1064-NEXT: .LBB7_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: @@ -4389,7 +4867,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s40, s7 ; GFX1032-NEXT: s_mov_b32 s41, s6 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -4403,23 +4880,47 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB7_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 @@ -4443,14 +4944,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1032-NEXT: .LBB7_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: @@ -4463,9 +4964,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -4476,18 +4976,47 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB7_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[42:43] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 -; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -4495,8 +5024,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] @@ -4506,22 +5035,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1164-NEXT: .LBB7_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; @@ -4536,7 +5065,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -4547,18 +5075,48 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB7_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[42:43] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 -; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -4566,33 +5124,120 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, s43 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s41 ; GFX1132-NEXT: s_mov_b32 s13, s40 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s42 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: v_mov_b32_e32 v5, 8 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB7_4 +; GFX1132-NEXT: .LBB7_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -4612,7 +5257,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] @@ -4626,16 +5270,81 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[42:43], s[42:43] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] @@ -4643,36 +5352,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: @@ -4694,7 +5403,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -4708,30 +5416,86 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v9, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v8, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], s[4:5], s[4:5] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], s[2:3], s[2:3] +; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] @@ -4740,22 +5504,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1064-DPP-NEXT: .LBB7_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: @@ -4777,7 +5543,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -4791,28 +5556,78 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -4823,22 +5638,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: @@ -4851,9 +5666,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -4864,18 +5678,89 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 +; GFX1164-DPP-NEXT: v_permlane64_b32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 -; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -4883,7 +5768,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -4892,24 +5777,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1164-DPP-NEXT: .LBB7_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; @@ -4924,7 +5809,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -4935,24 +5819,83 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 -; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -4961,23 +5904,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() @@ -5180,6 +6123,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-NEXT: .LBB8_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX7LESS-DPP-NEXT: .LBB8_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5344,19 +6324,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -5368,14 +6346,42 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX7LESS-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX7LESS-NEXT: .LBB9_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB9_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB9_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -5385,15 +6391,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_4 +; GFX7LESS-NEXT: .LBB9_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: @@ -5404,10 +6410,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -5422,26 +6428,51 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX9-NEXT: .LBB9_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB9_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB9_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB9_4 +; GFX9-NEXT: .LBB9_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: @@ -5451,45 +6482,70 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB9_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1064-NEXT: .LBB9_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: @@ -5499,125 +6555,263 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB9_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1032-NEXT: .LBB9_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB9_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1164-NEXT: .LBB9_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB9_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB9_4 +; GFX1132-NEXT: .LBB9_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -5626,10 +6820,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -5644,26 +6838,92 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[1:2], s[0:1], s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] +; GFX9-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[1:2] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: @@ -5673,45 +6933,103 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1064-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], s[4:5], s[4:5] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], s[2:3], s[2:3] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[9:10], v[0:1], v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-DPP-NEXT: v_max_f64 v[13:14], v[11:12], v[11:12] +; GFX1064-DPP-NEXT: v_min_f64 v[9:10], v[13:14], v[9:10] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: @@ -5721,123 +7039,303 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] -; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] +; GFX1032-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-DPP-NEXT: v_max_f64 v[9:10], v[11:12], v[11:12] +; GFX1032-DPP-NEXT: v_min_f64 v[9:10], v[9:10], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1164-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] -; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic @@ -6275,6 +7773,83 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v3, v2 +; GFX7LESS-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX7LESS-DPP-NEXT: .LBB10_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -6647,9 +8222,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -6670,17 +8242,46 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB11_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB11_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], v[41:42] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -6707,14 +8308,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_4 +; GFX7LESS-NEXT: .LBB11_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: @@ -6736,7 +8337,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] @@ -6750,16 +8350,41 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX9-NEXT: .LBB11_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB11_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -6767,8 +8392,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] @@ -6789,14 +8414,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB11_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: @@ -6818,7 +8443,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s40, s7 ; GFX1064-NEXT: s_mov_b32 s41, s6 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -6832,28 +8456,53 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_min_f64 v[3:4], v[2:3], v[4:5] +; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB11_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 @@ -6864,22 +8513,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s13, s40 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1064-NEXT: .LBB11_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: @@ -6901,7 +8550,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s40, s7 ; GFX1032-NEXT: s_mov_b32 s41, s6 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -6915,23 +8563,47 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB11_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 @@ -6955,14 +8627,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1032-NEXT: .LBB11_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: @@ -6975,9 +8647,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -6988,18 +8659,47 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB11_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[42:43] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 -; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -7007,8 +8707,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] @@ -7018,22 +8718,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1164-NEXT: .LBB11_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; @@ -7048,7 +8748,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -7059,18 +8758,48 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB11_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[42:43] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 -; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -7078,33 +8807,120 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v3, s43 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s41 ; GFX1132-NEXT: s_mov_b32 s13, s40 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s42 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off ; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: v_mov_b32_e32 v5, 8 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1132-NEXT: .LBB11_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -7124,7 +8940,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] @@ -7138,16 +8953,81 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[42:43], s[42:43] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] @@ -7155,36 +9035,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: @@ -7206,7 +9086,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -7220,30 +9099,86 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1064-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v9, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v8, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 +; GFX1064-DPP-NEXT: v_max_f64 v[8:9], s[4:5], s[4:5] +; GFX1064-DPP-NEXT: v_max_f64 v[10:11], s[2:3], s[2:3] +; GFX1064-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] @@ -7252,22 +9187,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: @@ -7289,7 +9226,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] @@ -7303,28 +9239,78 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1032-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -7335,22 +9321,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: @@ -7363,9 +9349,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -7376,18 +9361,89 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1164-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 +; GFX1164-DPP-NEXT: v_permlane64_b32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1164-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 -; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -7395,7 +9451,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -7404,24 +9460,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; @@ -7436,7 +9492,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -7447,24 +9502,83 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 -; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -7473,23 +9587,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() @@ -7680,6 +9794,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-NEXT: .LBB12_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX7LESS-DPP-NEXT: .LBB12_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8015,6 +10162,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-DPP-NEXT: .LBB13_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 2bba8d4f43b1a8..5ffa71d37164c3 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -5,7 +5,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s -; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s @@ -217,6 +217,42 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: .LBB0_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-DPP-NEXT: .LBB0_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec @@ -389,19 +425,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -413,27 +447,51 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB1_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB1_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4 +; GFX7LESS-NEXT: .LBB1_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: @@ -754,6 +812,56 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB1_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -1409,6 +1517,52 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-DPP-NEXT: .LBB2_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -1621,19 +1775,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -1645,27 +1797,51 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB3_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB3_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4 +; GFX7LESS-NEXT: .LBB3_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -1986,6 +2162,56 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-NEXT: .LBB3_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -2641,6 +2867,52 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-DPP-NEXT: .LBB4_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -2853,19 +3125,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -2877,27 +3147,51 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB5_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4 +; GFX7LESS-NEXT: .LBB5_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: @@ -3218,6 +3512,56 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB5_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -3627,19 +3971,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -3651,27 +3993,51 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB6_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB6_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB6_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB6_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_4 +; GFX7LESS-NEXT: .LBB6_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: @@ -3992,6 +4358,56 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: .LBB6_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -4647,6 +5063,52 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-NEXT: .LBB7_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB7_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX7LESS-DPP-NEXT: .LBB7_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -4858,19 +5320,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -4882,27 +5342,51 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX7LESS-NEXT: .LBB8_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 +; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB8_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB8_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB8_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_4 +; GFX7LESS-NEXT: .LBB8_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp: @@ -5223,6 +5707,56 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-NEXT: .LBB8_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -6077,6 +6611,86 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 +; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v4, v0, v4 +; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], v[1:2], 4.0 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v4, v3 +; GFX7LESS-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX7LESS-DPP-NEXT: .LBB9_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -6465,9 +7079,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -6477,25 +7088,49 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-NEXT: s_mov_b32 s12, s41 ; GFX7LESS-NEXT: s_mov_b32 s13, s40 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX7LESS-NEXT: .LBB10_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB10_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB10_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 @@ -6518,7 +7153,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s12, s41 ; GFX7LESS-NEXT: s_mov_b32 s13, s40 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -6532,8 +7167,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_4 +; GFX7LESS-NEXT: .LBB10_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6555,31 +7190,51 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s41 ; GFX9-NEXT: s_mov_b32 s13, s40 ; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX9-NEXT: .LBB10_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB10_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX9-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -6596,7 +7251,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s12, s41 ; GFX9-NEXT: s_mov_b32 s13, s40 ; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -6614,8 +7269,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB10_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB10_4 +; GFX9-NEXT: .LBB10_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6637,31 +7292,51 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s40, s7 ; GFX1064-NEXT: s_mov_b32 s41, s6 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-NEXT: s_mov_b32 s12, s41 ; GFX1064-NEXT: s_mov_b32 s13, s40 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB10_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] @@ -6670,7 +7345,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42 @@ -6697,8 +7372,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1064-NEXT: .LBB10_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6720,31 +7395,50 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s40, s7 ; GFX1032-NEXT: s_mov_b32 s41, s6 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-NEXT: s_mov_b32 s12, s41 ; GFX1032-NEXT: s_mov_b32 s13, s40 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB10_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] @@ -6753,7 +7447,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 @@ -6780,8 +7474,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1032-NEXT: .LBB10_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -6794,9 +7488,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -6804,29 +7497,53 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB10_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 -; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 @@ -6852,8 +7569,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1164-NEXT: .LBB10_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; @@ -6868,7 +7585,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -6879,24 +7595,49 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: v_mov_b32_e32 v41, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB10_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 -; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 @@ -6918,11 +7659,97 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1132-NEXT: .LBB10_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -6942,31 +7769,86 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -s[42:43] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] @@ -6983,12 +7865,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -6999,10 +7881,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -7024,31 +7906,78 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v9, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v8, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] @@ -7057,7 +7986,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 @@ -7084,8 +8013,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -7107,40 +8036,81 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] -; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 @@ -7167,8 +8137,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: @@ -7181,9 +8151,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -7191,29 +8160,87 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v10, v8 +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 -; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 @@ -7239,8 +8266,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; @@ -7255,7 +8282,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -7266,24 +8292,74 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 -; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -7305,8 +8381,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() @@ -7572,6 +8648,55 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-NEXT: .LBB11_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX7LESS-DPP-NEXT: .LBB11_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -7786,19 +8911,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -7810,30 +8933,56 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX7LESS-NEXT: .LBB12_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB12_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB12_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_4 +; GFX7LESS-NEXT: .LBB12_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -7844,10 +8993,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -7862,24 +9011,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB12_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB12_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB12_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB12_4 +; GFX9-NEXT: .LBB12_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -7889,43 +9061,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB12_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1064-NEXT: .LBB12_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -7935,117 +9130,245 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB12_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1032-NEXT: .LBB12_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB12_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1164-NEXT: .LBB12_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB12_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB12_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1132-NEXT: .LBB12_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -8054,10 +9377,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -8072,24 +9395,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-DPP-NEXT: .LBB12_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -8099,43 +9481,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-DPP-NEXT: .LBB12_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -8145,115 +9577,271 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-DPP-NEXT: .LBB12_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-DPP-NEXT: .LBB12_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-DPP-NEXT: .LBB12_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() strictfp %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic @@ -8518,6 +10106,55 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-DPP-NEXT: .LBB13_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -8733,19 +10370,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -8757,30 +10392,56 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX7LESS-NEXT: .LBB14_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB14_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB14_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_4 +; GFX7LESS-NEXT: .LBB14_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: @@ -8791,10 +10452,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -8809,24 +10470,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB14_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB14_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB14_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB14_4 +; GFX9-NEXT: .LBB14_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: @@ -8836,43 +10520,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB14_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1064-NEXT: .LBB14_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: @@ -8882,117 +10589,245 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB14_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1032-NEXT: .LBB14_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB14_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1164-NEXT: .LBB14_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB14_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB14_4 +; GFX1132-NEXT: .LBB14_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -9001,10 +10836,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 @@ -9019,24 +10854,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX9-DPP-NEXT: .LBB14_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: @@ -9046,43 +10940,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1064-DPP-NEXT: .LBB14_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: @@ -9092,115 +11036,271 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1032-DPP-NEXT: .LBB14_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1164-DPP-NEXT: .LBB14_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB14_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_2 +; GFX1132-DPP-NEXT: .LBB14_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic @@ -9211,19 +11311,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b32 s32, 0 -; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s42, -1 -; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 -; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 ; GFX7LESS-NEXT: s_mov_b32 s14, s8 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3] ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -9235,30 +11333,56 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX7LESS-NEXT: .LBB15_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB15_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB15_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_4 +; GFX7LESS-NEXT: .LBB15_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: @@ -9269,10 +11393,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -9287,24 +11411,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-NEXT: .LBB15_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB15_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX9-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB15_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: @@ -9314,43 +11461,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB15_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1064-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1064-NEXT: .LBB15_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: @@ -9360,117 +11530,245 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 -; GFX1032-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB15_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX1032-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1032-NEXT: .LBB15_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB15_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1164-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1164-NEXT: .LBB15_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 -; GFX1132-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB15_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] +; GFX1132-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB15_4 +; GFX1132-NEXT: .LBB15_5: ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -9479,10 +11777,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -9497,24 +11795,83 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3] +; GFX9-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -s[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX9-DPP-NEXT: .LBB15_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: @@ -9524,43 +11881,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1064-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1064-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1064-DPP-NEXT: .LBB15_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: @@ -9570,115 +11977,271 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1] +; GFX1032-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], -v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1032-DPP-NEXT: .LBB15_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2 +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1164-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1164-DPP-NEXT: .LBB15_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1] +; GFX1132-DPP-NEXT: .LBB15_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], -v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1132-DPP-NEXT: .LBB15_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic @@ -10150,6 +12713,88 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX7LESS-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v5, exec_hi, v5 +; GFX7LESS-DPP-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] +; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX7LESS-DPP-NEXT: ; %bb.1: +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v3, v2 +; GFX7LESS-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX7LESS-DPP-NEXT: .LBB16_3: +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -10552,9 +13197,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] @@ -10564,25 +13206,49 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-NEXT: s_mov_b32 s12, s41 ; GFX7LESS-NEXT: s_mov_b32 s13, s40 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 +; GFX7LESS-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX7LESS-NEXT: .LBB17_1: ; %ComputeLoop +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 +; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 +; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX7LESS-NEXT: s_cbranch_vccnz .LBB17_1 +; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execz .LBB17_5 +; GFX7LESS-NEXT: ; %bb.3: +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 -; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 @@ -10605,7 +13271,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b32 s12, s41 ; GFX7LESS-NEXT: s_mov_b32 s13, s40 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) @@ -10619,8 +13285,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_4 +; GFX7LESS-NEXT: .LBB17_5: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: @@ -10642,31 +13308,51 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s40, s7 ; GFX9-NEXT: s_mov_b32 s41, s6 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s41 ; GFX9-NEXT: s_mov_b32 s13, s40 ; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX9-NEXT: .LBB17_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9-NEXT: v_readlane_b32 s3, v1, s4 +; GFX9-NEXT: v_readlane_b32 s2, v0, s4 +; GFX9-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB17_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX9-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -10683,7 +13369,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s12, s41 ; GFX9-NEXT: s_mov_b32 s13, s40 ; GFX9-NEXT: s_mov_b32 s14, s33 -; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -10701,8 +13387,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB17_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB17_4 +; GFX9-NEXT: .LBB17_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: @@ -10724,31 +13410,51 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s40, s7 ; GFX1064-NEXT: s_mov_b32 s41, s6 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-NEXT: s_mov_b32 s12, s41 ; GFX1064-NEXT: s_mov_b32 s13, s40 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1064-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB17_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] @@ -10757,7 +13463,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42 @@ -10784,8 +13490,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1064-NEXT: .LBB17_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: @@ -10807,31 +13513,50 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s40, s7 ; GFX1032-NEXT: s_mov_b32 s41, s6 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-NEXT: s_mov_b32 s12, s41 ; GFX1032-NEXT: s_mov_b32 s13, s40 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB17_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] @@ -10840,7 +13565,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42 @@ -10867,8 +13592,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1032-NEXT: .LBB17_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: @@ -10881,9 +13606,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -10891,29 +13615,53 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: s_mov_b32 s40, s7 ; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] +; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 +; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB17_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 -; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 @@ -10939,8 +13687,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1164-NEXT: .LBB17_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm ; @@ -10955,7 +13703,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-NEXT: s_mov_b32 s40, s14 ; GFX1132-NEXT: s_mov_b32 s41, s13 @@ -10966,24 +13713,49 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: v_mov_b32_e32 v41, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v42, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB17_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 -; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 @@ -11005,11 +13777,97 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB17_4 +; GFX1132-NEXT: .LBB17_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm ; +; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: +; GFX7LESS-DPP: ; %bb.0: +; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-DPP-NEXT: s_endpgm +; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 @@ -11029,31 +13887,86 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s40, s7 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -s[42:43] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] @@ -11070,12 +13983,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s12, s41 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 -; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -11086,10 +13999,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX9-DPP-NEXT: .LBB17_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: @@ -11111,31 +14024,78 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v9, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v8, 32 +; GFX1064-DPP-NEXT: v_add_f64 v[8:9], s[2:3], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] @@ -11144,7 +14104,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 @@ -11171,8 +14131,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1064-DPP-NEXT: .LBB17_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: @@ -11194,31 +14154,72 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] @@ -11227,7 +14228,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 @@ -11254,8 +14255,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1032-DPP-NEXT: .LBB17_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: @@ -11268,9 +14269,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -11278,29 +14278,87 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v10, v8 +; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 -; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 @@ -11326,8 +14384,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1164-DPP-NEXT: .LBB17_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm ; @@ -11342,7 +14400,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 @@ -11353,24 +14410,74 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1 +; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 -; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -11392,8 +14499,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_2 +; GFX1132-DPP-NEXT: .LBB17_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index 1f92427fe8a237..f095aef7a0cc81 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -543,6 +543,24 @@ bb8: ; preds = %bb2 ret void } +; GCN-LABEL: {{^}}insert_or_disj_index: +; GCN: v_mov_b32_e32 v[[#VIDX:]], 0 + +; MOVREL: s_mov_b32 m0, s{{[0-9]+}} +; MOVREL: v_movreld_b32_e32 v[[#VIDX + 1]], v{{[0-9]+}} + +; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(DST) +; IDXMODE: v_mov_b32_e32 v[[#VIDX + 1]], v{{[0-9]+}} +; IDXMODE: s_set_gpr_idx_off +define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace(4) %in, i32 %val, <4 x i32> inreg %desc, i32 inreg %A) { +entry: + %idx = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %A, i32 0, i32 0) + %off = or disjoint i32 %idx, 1 + %v = insertelement <16 x i32> zeroinitializer, i32 %val, i32 %off + store <16 x i32> %v, ptr addrspace(1) %out + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 declare void @llvm.amdgcn.s.barrier() #2 diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index aca4730122f901..b9dc27cb7e0192 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -130,8 +130,9 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -290,8 +291,9 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v2, v3 ; GFX12-NEXT: v_or_b32_e32 v2, -5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 @@ -806,6 +808,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_mul_i32 s0, s0, 5 ; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_u32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -874,8 +877,9 @@ define void @flat_atomic_xchg_i32_noret(ptr %ptr, i32 %in) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1051,6 +1055,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_mul_i32 s1, s1, 5 ; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -1655,4 +1660,3 @@ entry: %bc = bitcast <2 x i32> %r.1 to <2 x float> ret <2 x float> %bc } - diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll index 270ab5fee1125e..824d3708c027db 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll @@ -1,8 +1,8 @@ ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s ; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s ; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s @@ -10,10 +10,10 @@ ; RUN: opt -mtriple=amdgcn-- -passes='default' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s ; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s ; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32,-wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=-wavefrontsize32,+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32,-wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=-wavefrontsize32,+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s ; GCN-LABEL: {{^}}fold_wavefrontsize: ; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize( diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index d8a790c7184084..65614a17fc0114 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -22,6 +22,7 @@ define float @local_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -137,6 +138,7 @@ define float @local_atomic_fadd_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -253,6 +255,7 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_add_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -366,6 +369,7 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_add_f32 v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -493,6 +497,7 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[0:1], 4.0, v[3:4] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -683,6 +688,7 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[0:1], 4.0, v[3:4] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -871,6 +877,7 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[3:4], 4.0, v[1:2] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1052,6 +1059,7 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[3:4], 4.0, v[1:2] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1253,6 +1261,7 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1558,6 +1567,7 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1870,6 +1880,7 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2163,6 +2174,7 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2456,6 +2468,7 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2690,6 +2703,7 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2938,6 +2952,7 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3291,6 +3306,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3651,6 +3667,7 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3992,6 +4009,7 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4333,6 +4351,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4622,6 +4641,7 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4889,6 +4909,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5112,6 +5133,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5335,6 +5357,7 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_f16 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5549,6 +5572,7 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_f16 v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5769,6 +5793,7 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6076,6 +6101,7 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6384,6 +6410,7 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_bf16 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6681,6 +6708,7 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_pk_add_bf16 v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6996,6 +7024,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_lshl_b32 s5, s1, 3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -7015,6 +7044,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_lshl_b32 s0, s1, 4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_f32 v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -7051,6 +7081,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_cbranch_execz .LBB28_8 ; GFX12-NEXT: ; %bb.7: ; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -7613,9 +7644,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_2 -; GFX7-NEXT: ; %bb.3: ; %Flow18 +; GFX7-NEXT: ; %bb.3: ; %Flow22 ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: .LBB28_4: ; %Flow19 +; GFX7-NEXT: .LBB28_4: ; %Flow23 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_mov_b64 s[8:9], exec ; GFX7-NEXT: v_readfirstlane_b32 s10, v1 @@ -7643,32 +7674,64 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_6 -; GFX7-NEXT: .LBB28_7: ; %Flow17 +; GFX7-NEXT: .LBB28_7: ; %Flow21 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v0 -; GFX7-NEXT: ds_read_b32 v0, v1 -; GFX7-NEXT: v_add_f32_e32 v2, s10, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB28_8: ; %atomicrmw.start8 +; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX7-NEXT: v_add_f32_e32 v0, s10, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s10 +; GFX7-NEXT: s_mov_b64 s[0:1], exec +; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX7-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX7-NEXT: ; implicit-def: $vgpr0 +; GFX7-NEXT: .LBB28_8: ; %ComputeLoop +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7-NEXT: v_readfirstlane_b32 s8, v1 +; GFX7-NEXT: v_readlane_b32 s9, v2, s5 +; GFX7-NEXT: s_mov_b32 m0, s5 +; GFX7-NEXT: v_writelane_b32 v0, s8, m0 +; GFX7-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX7-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7-NEXT: s_cbranch_vccnz .LBB28_8 +; GFX7-NEXT: ; %bb.9: ; %ComputeEnd +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7-NEXT: ; implicit-def: $vgpr2 +; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1] +; GFX7-NEXT: s_cbranch_execz .LBB28_13 +; GFX7-NEXT: ; %bb.10: +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v2, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB28_11: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: v_add_f32_e32 v0, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v1, v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v4, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7-NEXT: s_cbranch_execnz .LBB28_8 -; GFX7-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX7-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB28_11 +; GFX7-NEXT: ; %bb.12: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: .LBB28_13: ; %Flow19 +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: v_readfirstlane_b32 s4, v2 +; GFX7-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -7705,9 +7768,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_2 -; GFX6-NEXT: ; %bb.3: ; %Flow16 +; GFX6-NEXT: ; %bb.3: ; %Flow20 ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: .LBB28_4: ; %Flow17 +; GFX6-NEXT: .LBB28_4: ; %Flow21 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_mov_b64 s[8:9], exec ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 @@ -7735,32 +7798,64 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_6 -; GFX6-NEXT: .LBB28_7: ; %Flow15 +; GFX6-NEXT: .LBB28_7: ; %Flow19 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v0 -; GFX6-NEXT: ds_read_b32 v0, v1 -; GFX6-NEXT: v_add_f32_e32 v2, s10, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, s10 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: s_mov_b64 s[0:1], 0 -; GFX6-NEXT: .LBB28_8: ; %atomicrmw.start8 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX6-NEXT: v_add_f32_e32 v0, s10, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s10 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: .LBB28_8: ; %ComputeLoop +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: v_readfirstlane_b32 s8, v1 +; GFX6-NEXT: v_readlane_b32 s9, v2, s5 +; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_writelane_b32 v0, s8, m0 +; GFX6-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX6-NEXT: s_cbranch_vccnz .LBB28_8 +; GFX6-NEXT: ; %bb.9: ; %ComputeEnd +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX6-NEXT: ; implicit-def: $vgpr2 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1] +; GFX6-NEXT: s_cbranch_execz .LBB28_13 +; GFX6-NEXT: ; %bb.10: +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_b32 v2, v3 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .LBB28_11: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: v_add_f32_e32 v0, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v4, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_cbranch_execnz .LBB28_8 -; GFX6-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX6-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB28_11 +; GFX6-NEXT: ; %bb.12: ; %Flow +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: .LBB28_13: ; %Flow17 +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: v_readfirstlane_b32 s4, v2 +; GFX6-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -8389,9 +8484,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_2 -; GFX7-NEXT: ; %bb.3: ; %Flow18 +; GFX7-NEXT: ; %bb.3: ; %Flow22 ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: .LBB29_4: ; %Flow19 +; GFX7-NEXT: .LBB29_4: ; %Flow23 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_mov_b64 s[8:9], exec ; GFX7-NEXT: v_readfirstlane_b32 s10, v1 @@ -8419,32 +8514,64 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_6 -; GFX7-NEXT: .LBB29_7: ; %Flow17 +; GFX7-NEXT: .LBB29_7: ; %Flow21 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v0 -; GFX7-NEXT: ds_read_b32 v0, v1 -; GFX7-NEXT: v_add_f32_e32 v2, s10, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: .LBB29_8: ; %atomicrmw.start8 +; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX7-NEXT: v_add_f32_e32 v0, s10, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s10 +; GFX7-NEXT: s_mov_b64 s[0:1], exec +; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX7-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX7-NEXT: ; implicit-def: $vgpr0 +; GFX7-NEXT: .LBB29_8: ; %ComputeLoop +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX7-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7-NEXT: v_readfirstlane_b32 s8, v1 +; GFX7-NEXT: v_readlane_b32 s9, v2, s5 +; GFX7-NEXT: s_mov_b32 m0, s5 +; GFX7-NEXT: v_writelane_b32 v0, s8, m0 +; GFX7-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX7-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7-NEXT: s_cbranch_vccnz .LBB29_8 +; GFX7-NEXT: ; %bb.9: ; %ComputeEnd +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7-NEXT: ; implicit-def: $vgpr2 +; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1] +; GFX7-NEXT: s_cbranch_execz .LBB29_13 +; GFX7-NEXT: ; %bb.10: +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v2, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB29_11: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: v_add_f32_e32 v0, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v1, v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v4, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7-NEXT: s_cbranch_execnz .LBB29_8 -; GFX7-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX7-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB29_11 +; GFX7-NEXT: ; %bb.12: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: .LBB29_13: ; %Flow19 +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: v_readfirstlane_b32 s4, v2 +; GFX7-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -8481,9 +8608,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_2 -; GFX6-NEXT: ; %bb.3: ; %Flow16 +; GFX6-NEXT: ; %bb.3: ; %Flow20 ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: .LBB29_4: ; %Flow17 +; GFX6-NEXT: .LBB29_4: ; %Flow21 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_mov_b64 s[8:9], exec ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 @@ -8511,32 +8638,64 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_6 -; GFX6-NEXT: .LBB29_7: ; %Flow15 +; GFX6-NEXT: .LBB29_7: ; %Flow19 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v0 -; GFX6-NEXT: ds_read_b32 v0, v1 -; GFX6-NEXT: v_add_f32_e32 v2, s10, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, s10 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: s_mov_b64 s[0:1], 0 -; GFX6-NEXT: .LBB29_8: ; %atomicrmw.start8 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX6-NEXT: v_add_f32_e32 v0, s10, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s10 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: .LBB29_8: ; %ComputeLoop +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: v_readfirstlane_b32 s8, v1 +; GFX6-NEXT: v_readlane_b32 s9, v2, s5 +; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_writelane_b32 v0, s8, m0 +; GFX6-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX6-NEXT: s_cbranch_vccnz .LBB29_8 +; GFX6-NEXT: ; %bb.9: ; %ComputeEnd +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX6-NEXT: ; implicit-def: $vgpr2 +; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1] +; GFX6-NEXT: s_cbranch_execz .LBB29_13 +; GFX6-NEXT: ; %bb.10: +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_b32 v2, v3 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .LBB29_11: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: v_add_f32_e32 v0, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v4, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_cbranch_execnz .LBB29_8 -; GFX6-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 +; GFX6-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB29_11 +; GFX6-NEXT: ; %bb.12: ; %Flow +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: .LBB29_13: ; %Flow17 +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: v_readfirstlane_b32 s4, v2 +; GFX6-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -8561,6 +8720,7 @@ define float @local_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -8676,6 +8836,7 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_add_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 9a6ac12702a5da..6dec36c316ee31 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -22,6 +22,7 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -111,6 +112,7 @@ define float @local_atomic_fmax_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -202,6 +204,7 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -291,6 +294,7 @@ define void @local_atomic_fmax_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_f32 v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -387,6 +391,7 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -484,6 +489,7 @@ define double @local_atomic_fmax_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -583,6 +589,7 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_f64 v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -680,6 +687,7 @@ define void @local_atomic_fmax_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_f64 v0, v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -802,6 +810,7 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1114,6 +1123,7 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1433,6 +1443,7 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1734,6 +1745,7 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2035,6 +2047,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2277,6 +2290,7 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2532,6 +2546,7 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2887,6 +2902,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3249,6 +3265,7 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3592,6 +3609,7 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3935,6 +3953,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4226,6 +4245,7 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4505,6 +4525,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4775,6 +4796,7 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5044,6 +5066,7 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5304,6 +5327,7 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5588,6 +5612,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5963,6 +5988,7 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6337,6 +6363,7 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6699,6 +6726,7 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -7043,6 +7071,7 @@ define float @local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -7132,6 +7161,7 @@ define void @local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_max_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 4bc9404074c4ac..b3132a2fa80dd2 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -22,6 +22,7 @@ define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -111,6 +112,7 @@ define float @local_atomic_fmin_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -202,6 +204,7 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -291,6 +294,7 @@ define void @local_atomic_fmin_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_f32 v0, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -387,6 +391,7 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -484,6 +489,7 @@ define double @local_atomic_fmin_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -583,6 +589,7 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_f64 v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -680,6 +687,7 @@ define void @local_atomic_fmin_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_f64 v0, v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -802,6 +810,7 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1114,6 +1123,7 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1433,6 +1443,7 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1734,6 +1745,7 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2035,6 +2047,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2277,6 +2290,7 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2532,6 +2546,7 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2887,6 +2902,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3249,6 +3265,7 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3592,6 +3609,7 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3935,6 +3953,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4226,6 +4245,7 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4505,6 +4525,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4775,6 +4796,7 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 ; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5044,6 +5066,7 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5304,6 +5327,7 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5588,6 +5612,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5963,6 +5988,7 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6337,6 +6363,7 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6699,6 +6726,7 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -7043,6 +7071,7 @@ define float @local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -7132,6 +7161,7 @@ define void @local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_min_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 760bb5253f3dd1..5ebeddd04b2ae8 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -29,6 +29,7 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -239,6 +240,7 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -448,6 +450,7 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -647,6 +650,7 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -855,6 +859,7 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[0:1], -4.0, v[3:4] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1070,6 +1075,7 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[0:1], -4.0, v[3:4] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1283,6 +1289,7 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[3:4], -4.0, v[1:2] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1487,6 +1494,7 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[3:4], -4.0, v[1:2] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -1711,6 +1719,7 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v2, v0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2016,6 +2025,7 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2328,6 +2338,7 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2621,6 +2632,7 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -2914,6 +2926,7 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3148,6 +3161,7 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3396,6 +3410,7 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -3749,6 +3764,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4109,6 +4125,7 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4450,6 +4467,7 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 ; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4791,6 +4809,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5080,6 +5099,7 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5355,6 +5375,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5608,6 +5629,7 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5859,6 +5881,7 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6100,6 +6123,7 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6368,6 +6392,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6743,6 +6768,7 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 ; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -7117,6 +7143,7 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -7479,6 +7506,7 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -7830,6 +7858,7 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX12-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -8038,6 +8067,7 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll index 7a68fb5ce508cf..6b7a6fb27fadfa 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll @@ -138,6 +138,7 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX12-WGP-LABEL: workgroup_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -209,6 +210,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX12-WGP-LABEL: workgroup_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -281,6 +283,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX12-WGP-LABEL: workgroup_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -421,6 +424,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX12-WGP-LABEL: workgroup_one_as_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -492,6 +496,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -564,6 +569,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -739,6 +745,7 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX12-WGP-LABEL: agent_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -747,6 +754,7 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX12-CU-LABEL: agent_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -835,6 +843,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX12-WGP-LABEL: agent_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -844,6 +853,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX12-CU-LABEL: agent_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -933,6 +943,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX12-WGP-LABEL: agent_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -942,6 +953,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX12-CU-LABEL: agent_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1113,6 +1125,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; ; GFX12-WGP-LABEL: agent_one_as_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1121,6 +1134,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; ; GFX12-CU-LABEL: agent_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1209,6 +1223,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; ; GFX12-WGP-LABEL: agent_one_as_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1218,6 +1233,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; ; GFX12-CU-LABEL: agent_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1307,6 +1323,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; ; GFX12-WGP-LABEL: agent_one_as_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1316,6 +1333,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; ; GFX12-CU-LABEL: agent_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1491,6 +1509,7 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX12-WGP-LABEL: system_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1499,6 +1518,7 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX12-CU-LABEL: system_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1591,6 +1611,7 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX12-WGP-LABEL: system_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1600,6 +1621,7 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX12-CU-LABEL: system_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1693,6 +1715,7 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX12-WGP-LABEL: system_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1702,6 +1725,7 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX12-CU-LABEL: system_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1877,6 +1901,7 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; ; GFX12-WGP-LABEL: system_one_as_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1885,6 +1910,7 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; ; GFX12-CU-LABEL: system_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1977,6 +2003,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; ; GFX12-WGP-LABEL: system_one_as_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1986,6 +2013,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; ; GFX12-CU-LABEL: system_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -2079,6 +2107,7 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; ; GFX12-WGP-LABEL: system_one_as_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2088,6 +2117,7 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; ; GFX12-CU-LABEL: system_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll index e1794dbed32fc2..f564f8e4e0d67f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll @@ -137,12 +137,10 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX12-WGP-LABEL: workgroup_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: workgroup_release_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") release, !mmra !{!"amdgpu-as", !"local"} @@ -205,12 +203,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX12-WGP-LABEL: workgroup_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: workgroup_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-as", !"local"} @@ -273,12 +269,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX12-WGP-LABEL: workgroup_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: workgroup_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-as", !"local"} @@ -637,12 +631,10 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX12-WGP-LABEL: agent_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: agent_release_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm entry: fence syncscope("agent") release, !mmra !{!"amdgpu-as", !"local"} @@ -705,12 +697,10 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX12-WGP-LABEL: agent_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: agent_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-as", !"local"} @@ -773,12 +763,10 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX12-WGP-LABEL: agent_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: agent_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-as", !"local"} @@ -1137,12 +1125,10 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX12-WGP-LABEL: system_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: system_release_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm entry: fence release, !mmra !{!"amdgpu-as", !"local"} @@ -1205,12 +1191,10 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX12-WGP-LABEL: system_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: system_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm entry: fence acq_rel, !mmra !{!"amdgpu-as", !"local"} @@ -1273,12 +1257,10 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX12-WGP-LABEL: system_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: system_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm entry: fence seq_cst, !mmra !{!"amdgpu-as", !"local"} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll index 4128bfe392dc75..af7c66a2bd2cd9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -1065,6 +1065,7 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX12-WGP-LABEL: workgroup_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1144,6 +1145,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX12-WGP-LABEL: workgroup_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1224,6 +1226,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX12-WGP-LABEL: workgroup_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1365,6 +1368,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX12-WGP-LABEL: workgroup_one_as_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1436,6 +1440,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1508,6 +1513,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1683,6 +1689,7 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX12-WGP-LABEL: agent_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1691,6 +1698,7 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX12-CU-LABEL: agent_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 @@ -1779,6 +1787,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX12-WGP-LABEL: agent_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1788,6 +1797,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX12-CU-LABEL: agent_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 @@ -1877,6 +1887,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX12-WGP-LABEL: agent_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1886,6 +1897,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX12-CU-LABEL: agent_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 @@ -2057,6 +2069,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; ; GFX12-WGP-LABEL: agent_one_as_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2065,6 +2078,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; ; GFX12-CU-LABEL: agent_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -2153,6 +2167,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; ; GFX12-WGP-LABEL: agent_one_as_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2162,6 +2177,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; ; GFX12-CU-LABEL: agent_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -2251,6 +2267,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; ; GFX12-WGP-LABEL: agent_one_as_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2260,6 +2277,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; ; GFX12-CU-LABEL: agent_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -2435,6 +2453,7 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX12-WGP-LABEL: system_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2443,6 +2462,7 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX12-CU-LABEL: system_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 @@ -2535,6 +2555,7 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX12-WGP-LABEL: system_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2544,6 +2565,7 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX12-CU-LABEL: system_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 @@ -2637,6 +2659,7 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX12-WGP-LABEL: system_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2646,6 +2669,7 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX12-CU-LABEL: system_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 @@ -2821,6 +2845,7 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; ; GFX12-WGP-LABEL: system_one_as_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2829,6 +2854,7 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; ; GFX12-CU-LABEL: system_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -2921,6 +2947,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; ; GFX12-WGP-LABEL: system_one_as_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2930,6 +2957,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; ; GFX12-CU-LABEL: system_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -3023,6 +3051,7 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; ; GFX12-WGP-LABEL: system_one_as_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -3032,6 +3061,7 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; ; GFX12-CU-LABEL: system_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 193f642cb1fa10..45e8b3bcff13c5 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -354,7 +354,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -369,7 +369,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -552,7 +552,7 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -570,7 +570,7 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -774,7 +774,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -796,7 +796,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_monotonic_store: @@ -1084,7 +1084,7 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -1229,11 +1229,12 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_release_store: @@ -1244,11 +1245,12 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -1393,11 +1395,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_seq_cst_store: @@ -1408,11 +1411,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -1552,7 +1556,7 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_monotonic_atomicrmw: @@ -1564,7 +1568,7 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -1731,7 +1735,7 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -1745,7 +1749,7 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -1903,11 +1907,12 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_release_atomicrmw: @@ -1919,11 +1924,12 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -2106,11 +2112,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -2124,11 +2131,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -2313,11 +2321,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -2331,11 +2340,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -2515,7 +2525,7 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2533,7 +2543,7 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2735,11 +2745,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2757,11 +2768,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2963,11 +2975,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2985,11 +2998,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3211,7 +3225,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: @@ -3227,7 +3241,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -3469,7 +3483,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -3487,7 +3501,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -3720,11 +3734,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_release_monotonic_cmpxchg: @@ -3740,11 +3755,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -4002,11 +4018,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -4024,11 +4041,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -4288,11 +4306,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -4310,11 +4329,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -4558,7 +4578,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -4576,7 +4596,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -4820,7 +4840,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -4838,7 +4858,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -5098,11 +5118,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -5120,11 +5141,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -5384,11 +5406,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -5406,11 +5429,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -5670,11 +5694,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -5692,11 +5717,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -5956,11 +5982,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -5978,11 +6005,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -6242,11 +6270,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -6264,11 +6293,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -6528,11 +6558,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -6550,11 +6581,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -6814,11 +6846,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -6836,11 +6869,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -7100,11 +7134,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -7122,11 +7157,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -7375,7 +7411,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7395,7 +7431,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7661,7 +7697,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7684,7 +7720,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7956,11 +7992,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7980,11 +8017,12 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8266,11 +8304,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8293,11 +8332,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8582,11 +8622,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8609,11 +8650,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8882,7 +8924,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8905,7 +8947,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9174,7 +9216,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9197,7 +9239,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9482,11 +9524,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9509,11 +9552,12 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9798,11 +9842,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9825,11 +9870,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10114,11 +10160,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10141,11 +10188,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10430,11 +10478,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10457,11 +10506,12 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10746,11 +10796,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10773,11 +10824,12 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11062,11 +11114,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11089,11 +11142,12 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11378,11 +11432,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11405,11 +11460,12 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11694,11 +11750,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11721,11 +11778,12 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12085,7 +12143,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12100,7 +12158,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12291,7 +12349,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -12310,7 +12368,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -12523,7 +12581,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -12546,7 +12604,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -12824,7 +12882,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_store: @@ -12835,7 +12893,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -12980,11 +13038,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_one_as_release_store: @@ -12995,11 +13054,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -13144,11 +13204,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_store: @@ -13159,11 +13220,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -13303,7 +13365,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: @@ -13315,7 +13377,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -13478,7 +13540,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -13492,7 +13554,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -13650,11 +13712,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_one_as_release_atomicrmw: @@ -13666,11 +13729,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -13849,11 +13913,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -13867,11 +13932,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -14052,11 +14118,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -14070,11 +14137,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -14262,7 +14330,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -14281,7 +14349,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -14492,11 +14560,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -14515,11 +14584,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -14730,11 +14800,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -14753,11 +14824,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -14980,7 +15052,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: @@ -14996,7 +15068,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -15234,7 +15306,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -15252,7 +15324,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -15485,11 +15557,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: @@ -15505,11 +15578,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -15763,11 +15837,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -15785,11 +15860,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -16045,11 +16121,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -16067,11 +16144,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -16311,7 +16389,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -16329,7 +16407,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -16569,7 +16647,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -16587,7 +16665,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -16843,11 +16921,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -16865,11 +16944,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -17125,11 +17205,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -17147,11 +17228,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -17407,11 +17489,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -17429,11 +17512,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -17689,11 +17773,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -17711,11 +17796,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -17971,11 +18057,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -17993,11 +18080,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -18253,11 +18341,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -18275,11 +18364,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -18535,11 +18625,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -18557,11 +18648,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -18817,11 +18909,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -18839,11 +18932,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -19092,7 +19186,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -19112,7 +19206,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -19386,7 +19480,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19410,7 +19504,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -19683,11 +19777,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -19707,11 +19802,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -20001,11 +20097,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20029,11 +20126,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20327,11 +20425,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20355,11 +20454,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20637,7 +20737,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20661,7 +20761,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20939,7 +21039,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20963,7 +21063,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21257,11 +21357,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21285,11 +21386,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21583,11 +21685,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21611,11 +21714,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21909,11 +22013,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21937,11 +22042,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -22235,11 +22341,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -22263,11 +22370,12 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -22561,11 +22669,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -22589,11 +22698,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -22887,11 +22997,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -22915,11 +23026,12 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -23213,11 +23325,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -23241,11 +23354,12 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -23539,11 +23653,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -23567,11 +23682,12 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 3bb871467c2309..e77f1432c1c9d0 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -354,7 +354,7 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -369,7 +369,7 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -554,7 +554,7 @@ define amdgpu_kernel void @flat_system_acquire_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -572,7 +572,7 @@ define amdgpu_kernel void @flat_system_acquire_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -778,7 +778,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -800,7 +800,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1077,7 +1077,7 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_monotonic_store: @@ -1088,7 +1088,7 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -1235,11 +1235,12 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_release_store: @@ -1250,11 +1251,12 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -1401,11 +1403,12 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_seq_cst_store: @@ -1416,11 +1419,12 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -1560,7 +1564,7 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_monotonic_atomicrmw: @@ -1572,7 +1576,7 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -1741,7 +1745,7 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -1755,7 +1759,7 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -1915,11 +1919,12 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_release_atomicrmw: @@ -1931,11 +1936,12 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -2122,11 +2128,12 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -2140,11 +2147,12 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -2333,11 +2341,12 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -2351,11 +2360,12 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -2537,7 +2547,7 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2555,7 +2565,7 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2761,11 +2771,12 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2783,11 +2794,12 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2993,11 +3005,12 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3015,11 +3028,12 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3241,7 +3255,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: @@ -3257,7 +3271,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -3501,7 +3515,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -3519,7 +3533,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -3754,11 +3768,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_release_monotonic_cmpxchg: @@ -3774,11 +3789,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -4040,11 +4056,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -4062,11 +4079,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -4330,11 +4348,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -4352,11 +4371,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -4602,7 +4622,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -4620,7 +4640,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -4866,7 +4886,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -4884,7 +4904,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -5148,11 +5168,12 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -5170,11 +5191,12 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -5438,11 +5460,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -5460,11 +5483,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -5728,11 +5752,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -5750,11 +5775,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -6018,11 +6044,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -6040,11 +6067,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -6308,11 +6336,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -6330,11 +6359,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -6598,11 +6628,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -6620,11 +6651,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -6888,11 +6920,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -6910,11 +6943,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -7178,11 +7212,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -7200,11 +7235,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -7453,7 +7489,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7473,7 +7509,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7741,7 +7777,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7764,7 +7800,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8038,11 +8074,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8062,11 +8099,12 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8352,11 +8390,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8379,11 +8418,12 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8672,11 +8712,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8699,11 +8740,12 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8974,7 +9016,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8997,7 +9039,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9268,7 +9310,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9291,7 +9333,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9580,11 +9622,12 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9607,11 +9650,12 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9900,11 +9944,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9927,11 +9972,12 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10220,11 +10266,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10247,11 +10294,12 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10540,11 +10588,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10567,11 +10616,12 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10860,11 +10910,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10887,11 +10938,12 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11180,11 +11232,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11207,11 +11260,12 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11500,11 +11554,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11527,11 +11582,12 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11820,11 +11876,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11847,11 +11904,12 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12211,7 +12269,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12226,7 +12284,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12419,7 +12477,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -12438,7 +12496,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -12653,7 +12711,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -12676,7 +12734,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -12954,7 +13012,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_one_as_monotonic_store: @@ -12965,7 +13023,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -13112,11 +13170,12 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_one_as_release_store: @@ -13127,11 +13186,12 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -13278,11 +13338,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_store: @@ -13293,11 +13354,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: @@ -13437,7 +13499,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: @@ -13449,7 +13511,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -13614,7 +13676,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -13628,7 +13690,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -13788,11 +13850,12 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_one_as_release_atomicrmw: @@ -13804,11 +13867,12 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -13991,11 +14055,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -14009,11 +14074,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -14198,11 +14264,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -14216,11 +14283,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -14410,7 +14478,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -14429,7 +14497,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -14644,11 +14712,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -14667,11 +14736,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -14886,11 +14956,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -14909,11 +14980,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -15136,7 +15208,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: @@ -15152,7 +15224,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -15392,7 +15464,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -15410,7 +15482,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -15645,11 +15717,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: @@ -15665,11 +15738,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -15927,11 +16001,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -15949,11 +16024,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -16213,11 +16289,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -16235,11 +16312,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -16481,7 +16559,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -16499,7 +16577,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -16741,7 +16819,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -16759,7 +16837,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -17019,11 +17097,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -17041,11 +17120,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -17305,11 +17385,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -17327,11 +17408,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -17591,11 +17673,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -17613,11 +17696,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -17877,11 +17961,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -17899,11 +17984,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -18163,11 +18249,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -18185,11 +18272,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -18449,11 +18537,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -18471,11 +18560,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -18735,11 +18825,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -18757,11 +18848,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -19021,11 +19113,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -19043,11 +19136,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -19296,7 +19390,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -19316,7 +19410,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -19592,7 +19686,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19616,7 +19710,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -19891,11 +19985,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -19915,11 +20010,12 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -20213,11 +20309,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20241,11 +20338,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20543,11 +20641,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20571,11 +20670,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20855,7 +20955,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20879,7 +20979,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21159,7 +21259,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21183,7 +21283,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21481,11 +21581,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21509,11 +21610,12 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21811,11 +21913,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21839,11 +21942,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -22141,11 +22245,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -22169,11 +22274,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -22471,11 +22577,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -22499,11 +22606,12 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -22801,11 +22909,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -22829,11 +22938,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -23131,11 +23241,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -23159,11 +23270,12 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -23461,11 +23573,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -23489,11 +23602,12 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -23791,11 +23905,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -23819,11 +23934,12 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index 8708fcf3b45913..6bf54ccabc9dad 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -901,7 +901,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1018,11 +1018,12 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_volatile_workgroup_release_store: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index 405168a1b5c246..8949e4b782f630 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -354,7 +354,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -549,7 +549,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -764,7 +764,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1058,7 +1058,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_monotonic_store: @@ -1210,11 +1210,12 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_release_store: @@ -1367,11 +1368,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_seq_cst_store: @@ -1523,7 +1525,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_monotonic_atomicrmw: @@ -1691,7 +1693,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -1858,11 +1860,12 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_release_atomicrmw: @@ -2043,11 +2046,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -2231,11 +2235,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -2426,7 +2431,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2637,11 +2642,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2853,11 +2859,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3096,7 +3103,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: @@ -3343,7 +3350,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -3589,11 +3596,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: @@ -3853,11 +3861,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -4120,11 +4129,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -4375,7 +4385,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -4625,7 +4635,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -4887,11 +4897,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -5154,11 +5165,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -5421,11 +5433,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -5688,11 +5701,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -5959,7 +5973,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -6242,7 +6256,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -6531,11 +6545,12 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -6831,11 +6846,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7135,11 +7151,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7427,7 +7444,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7714,7 +7731,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8013,11 +8030,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8317,11 +8335,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8621,11 +8640,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8925,11 +8945,12 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9229,11 +9250,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9533,11 +9555,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9837,11 +9860,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10141,11 +10165,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10527,7 +10552,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10718,7 +10743,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10923,7 +10948,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11216,7 +11241,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_store: @@ -11362,11 +11387,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_one_as_release_store: @@ -11512,11 +11538,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_store: @@ -11667,7 +11694,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: @@ -11827,7 +11854,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -11987,11 +12014,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: @@ -12157,11 +12185,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -12329,11 +12358,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -12518,7 +12548,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -12719,11 +12749,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -12924,11 +12955,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -13166,7 +13198,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: @@ -13405,7 +13437,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -13644,11 +13676,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: @@ -13893,11 +13926,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -14144,11 +14178,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -14389,7 +14424,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -14630,7 +14665,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -14877,11 +14912,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -15128,11 +15164,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -15379,11 +15416,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -15630,11 +15668,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -15881,11 +15920,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -16132,11 +16172,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -16383,11 +16424,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -16634,11 +16676,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -16903,7 +16946,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -17182,7 +17225,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -17465,11 +17508,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -17754,11 +17798,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18047,11 +18092,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18334,7 +18380,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18617,7 +18663,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18906,11 +18952,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19199,11 +19246,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19492,11 +19540,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19785,11 +19834,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20078,11 +20128,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20371,11 +20422,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20664,11 +20716,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20957,11 +21010,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index 793a1bab76d39a..b56860991b1948 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -382,7 +382,7 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -394,7 +394,7 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -592,7 +592,7 @@ define amdgpu_kernel void @global_agent_acquire_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -607,7 +607,7 @@ define amdgpu_kernel void @global_agent_acquire_load( ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -819,7 +819,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -838,7 +838,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1167,7 +1167,7 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_monotonic_store: @@ -1178,7 +1178,7 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -1351,11 +1351,12 @@ define amdgpu_kernel void @global_agent_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_release_store: @@ -1366,11 +1367,12 @@ define amdgpu_kernel void @global_agent_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -1543,11 +1545,12 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_seq_cst_store: @@ -1558,11 +1561,12 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -1710,7 +1714,7 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_monotonic_atomicrmw: @@ -1720,7 +1724,7 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -1893,7 +1897,7 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -1905,7 +1909,7 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -2072,11 +2076,12 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_release_atomicrmw: @@ -2086,11 +2091,12 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -2280,11 +2286,12 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -2296,11 +2303,12 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -2492,11 +2500,12 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -2508,11 +2517,12 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -2699,7 +2709,7 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2714,7 +2724,7 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -2922,11 +2932,12 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2941,11 +2952,12 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -3153,11 +3165,12 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -3172,11 +3185,12 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -3390,7 +3404,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: @@ -3405,7 +3419,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -3639,7 +3653,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -3656,7 +3670,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -3884,11 +3898,12 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_release_monotonic_cmpxchg: @@ -3903,11 +3918,12 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -4158,11 +4174,12 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -4179,11 +4196,12 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -4436,11 +4454,12 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -4457,11 +4476,12 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -4697,7 +4717,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -4714,7 +4734,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -4950,7 +4970,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -4967,7 +4987,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -5220,11 +5240,12 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -5241,11 +5262,12 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -5498,11 +5520,12 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -5519,11 +5542,12 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -5776,11 +5800,12 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -5797,11 +5822,12 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -6054,11 +6080,12 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -6075,11 +6102,12 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -6332,11 +6360,12 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -6353,11 +6382,12 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -6610,11 +6640,12 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -6631,11 +6662,12 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -6888,11 +6920,12 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -6909,11 +6942,12 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -7166,11 +7200,12 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -7187,11 +7222,12 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -7427,7 +7463,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -7444,7 +7480,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -7697,7 +7733,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -7717,7 +7753,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -7975,11 +8011,12 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -7996,11 +8033,12 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -8270,11 +8308,12 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8294,11 +8333,12 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -8571,11 +8611,12 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8595,11 +8636,12 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -8855,7 +8897,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8875,7 +8917,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -9131,7 +9173,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9151,7 +9193,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -9424,11 +9466,12 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9448,11 +9491,12 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -9725,11 +9769,12 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9749,11 +9794,12 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10026,11 +10072,12 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10050,11 +10097,12 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10327,11 +10375,12 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10351,11 +10400,12 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10628,11 +10678,12 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10652,11 +10703,12 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10929,11 +10981,12 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10953,11 +11006,12 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -11230,11 +11284,12 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11254,11 +11309,12 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -11531,11 +11587,12 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11555,11 +11612,12 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -11944,7 +12002,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -11956,7 +12014,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -12154,7 +12212,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -12169,7 +12227,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -12381,7 +12439,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -12400,7 +12458,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -12729,7 +12787,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_store: @@ -12740,7 +12798,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -12913,11 +12971,12 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_release_store: @@ -12928,11 +12987,12 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -13105,11 +13165,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_store: @@ -13120,11 +13181,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -13272,7 +13334,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_atomicrmw: @@ -13282,7 +13344,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -13455,7 +13517,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -13467,7 +13529,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -13634,11 +13696,12 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_release_atomicrmw: @@ -13648,11 +13711,12 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -13842,11 +13906,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -13858,11 +13923,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -14054,11 +14120,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -14070,11 +14137,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -14261,7 +14329,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -14276,7 +14344,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -14484,11 +14552,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -14503,11 +14572,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -14715,11 +14785,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -14734,11 +14805,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -14952,7 +15024,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: @@ -14967,7 +15039,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -15201,7 +15273,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -15218,7 +15290,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -15446,11 +15518,12 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: @@ -15465,11 +15538,12 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -15720,11 +15794,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -15741,11 +15816,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -15998,11 +16074,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -16019,11 +16096,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -16259,7 +16337,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -16276,7 +16354,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -16512,7 +16590,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -16529,7 +16607,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -16782,11 +16860,12 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -16803,11 +16882,12 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -17060,11 +17140,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -17081,11 +17162,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -17338,11 +17420,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -17359,11 +17442,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -17616,11 +17700,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -17637,11 +17722,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -17894,11 +17980,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -17915,11 +18002,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -18172,11 +18260,12 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -18193,11 +18282,12 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -18450,11 +18540,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -18471,11 +18562,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -18728,11 +18820,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_endpgm @@ -18749,11 +18842,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm @@ -18989,7 +19083,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -19006,7 +19100,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -19259,7 +19353,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19279,7 +19373,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -19552,11 +19646,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19576,11 +19671,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -19853,11 +19949,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19877,11 +19974,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20137,7 +20235,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20157,7 +20255,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20413,7 +20511,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20433,7 +20531,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20706,11 +20804,12 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20730,11 +20829,12 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21007,11 +21107,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21031,11 +21132,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21308,11 +21410,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21332,11 +21435,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21609,11 +21713,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21633,11 +21738,12 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21910,11 +22016,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21934,11 +22041,12 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -22211,11 +22319,12 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -22235,11 +22344,12 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -22512,11 +22622,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -22536,11 +22647,12 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -22813,11 +22925,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -22837,11 +22950,12 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index 1dbf0b2e977830..62a4f3b43b2dcd 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -382,7 +382,7 @@ define amdgpu_kernel void @global_system_monotonic_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -394,7 +394,7 @@ define amdgpu_kernel void @global_system_monotonic_load( ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -594,7 +594,7 @@ define amdgpu_kernel void @global_system_acquire_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -609,7 +609,7 @@ define amdgpu_kernel void @global_system_acquire_load( ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -823,7 +823,7 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -842,7 +842,7 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1171,7 +1171,7 @@ define amdgpu_kernel void @global_system_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_monotonic_store: @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @global_system_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -1357,11 +1357,12 @@ define amdgpu_kernel void @global_system_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_release_store: @@ -1372,11 +1373,12 @@ define amdgpu_kernel void @global_system_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -1551,11 +1553,12 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_seq_cst_store: @@ -1566,11 +1569,12 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -1718,7 +1722,7 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_monotonic_atomicrmw: @@ -1728,7 +1732,7 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -1903,7 +1907,7 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -1915,7 +1919,7 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -2084,11 +2088,12 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_release_atomicrmw: @@ -2098,11 +2103,12 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -2296,11 +2302,12 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -2312,11 +2319,12 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -2512,11 +2520,12 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -2528,11 +2537,12 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -2721,7 +2731,7 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2736,7 +2746,7 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -2948,11 +2958,12 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2967,11 +2978,12 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -3183,11 +3195,12 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -3202,11 +3215,12 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -3420,7 +3434,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: @@ -3435,7 +3449,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -3671,7 +3685,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -3688,7 +3702,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -3918,11 +3932,12 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_release_monotonic_cmpxchg: @@ -3937,11 +3952,12 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -4196,11 +4212,12 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -4217,11 +4234,12 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -4478,11 +4496,12 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -4499,11 +4518,12 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -4741,7 +4761,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -4758,7 +4778,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -4996,7 +5016,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -5013,7 +5033,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -5270,11 +5290,12 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -5291,11 +5312,12 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -5552,11 +5574,12 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -5573,11 +5596,12 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -5834,11 +5858,12 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -5855,11 +5880,12 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -6116,11 +6142,12 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -6137,11 +6164,12 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -6377,7 +6405,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -6394,7 +6422,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -6649,7 +6677,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -6669,7 +6697,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -6946,11 +6974,12 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -6970,11 +6999,12 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -7251,11 +7281,12 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -7275,11 +7306,12 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -7537,7 +7569,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -7557,7 +7589,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -7815,7 +7847,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -7835,7 +7867,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -8112,11 +8144,12 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8136,11 +8169,12 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -8417,11 +8451,12 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8441,11 +8476,12 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -8722,11 +8758,12 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8746,11 +8783,12 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -9027,11 +9065,12 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9051,11 +9090,12 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -9332,11 +9372,12 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9356,11 +9397,12 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -9637,11 +9679,12 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9661,11 +9704,12 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -9942,11 +9986,12 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9966,11 +10011,12 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10247,11 +10293,12 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10271,11 +10318,12 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10660,7 +10708,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -10672,7 +10720,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -10872,7 +10920,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10887,7 +10935,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -11101,7 +11149,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11120,7 +11168,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -11449,7 +11497,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_store: @@ -11460,7 +11508,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -11635,11 +11683,12 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_release_store: @@ -11650,11 +11699,12 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -11829,11 +11879,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_store: @@ -11844,11 +11895,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: @@ -11996,7 +12048,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_atomicrmw: @@ -12006,7 +12058,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -12181,7 +12233,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -12193,7 +12245,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -12362,11 +12414,12 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_release_atomicrmw: @@ -12376,11 +12429,12 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -12574,11 +12628,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -12590,11 +12645,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -12790,11 +12846,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -12806,11 +12863,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -12999,7 +13057,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -13014,7 +13072,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -13226,11 +13284,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -13245,11 +13304,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -13461,11 +13521,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -13480,11 +13541,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -13698,7 +13760,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: @@ -13713,7 +13775,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -13949,7 +14011,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -13966,7 +14028,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -14196,11 +14258,12 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: @@ -14215,11 +14278,12 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -14474,11 +14538,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -14495,11 +14560,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -14756,11 +14822,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -14777,11 +14844,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -15019,7 +15087,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -15036,7 +15104,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -15274,7 +15342,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -15291,7 +15359,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -15548,11 +15616,12 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -15569,11 +15638,12 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -15830,11 +15900,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -15851,11 +15922,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -16112,11 +16184,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -16133,11 +16206,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -16394,11 +16468,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -16415,11 +16490,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -16676,11 +16752,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -16697,11 +16774,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -16958,11 +17036,12 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -16979,11 +17058,12 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -17240,11 +17320,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -17261,11 +17342,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -17522,11 +17604,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_endpgm @@ -17543,11 +17626,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm @@ -17783,7 +17867,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -17800,7 +17884,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -18055,7 +18139,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18075,7 +18159,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -18335,11 +18419,12 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -18356,11 +18441,12 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm @@ -18634,11 +18720,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18658,11 +18745,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -18939,11 +19027,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18963,11 +19052,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -19225,7 +19315,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19245,7 +19335,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -19503,7 +19593,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19523,7 +19613,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -19800,11 +19890,12 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19824,11 +19915,12 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20105,11 +20197,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20129,11 +20222,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20410,11 +20504,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20434,11 +20529,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -20715,11 +20811,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20739,11 +20836,12 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21020,11 +21118,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21044,11 +21143,12 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21325,11 +21425,12 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21349,11 +21450,12 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21630,11 +21732,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21654,11 +21757,12 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -21935,11 +22039,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21959,11 +22064,12 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 -; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 11206cb695aa4a..a98efb49b4b72b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -837,7 +837,7 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -972,11 +972,12 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_volatile_workgroup_release_store: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 643684b7550add..30bf4920715352 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -382,7 +382,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -582,7 +582,7 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -794,7 +794,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -1136,7 +1136,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_monotonic_store: @@ -1316,11 +1316,12 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_release_store: @@ -1501,11 +1502,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_store: @@ -1665,7 +1667,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_monotonic_atomicrmw: @@ -1831,7 +1833,7 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -2004,11 +2006,12 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_release_atomicrmw: @@ -2188,11 +2191,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -2374,11 +2378,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -2566,7 +2571,7 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2772,11 +2777,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -2983,11 +2989,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -3214,7 +3221,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: @@ -3446,7 +3453,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -3685,11 +3692,12 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: @@ -3935,11 +3943,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -4187,11 +4196,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -4426,7 +4436,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -4660,7 +4670,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -4907,11 +4917,12 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -5159,11 +5170,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -5411,11 +5423,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -5663,11 +5676,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -5915,11 +5929,12 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -6167,11 +6182,12 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -6419,11 +6435,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -6671,11 +6688,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -6927,7 +6945,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -7186,7 +7204,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -7457,11 +7475,12 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -7734,11 +7753,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8014,11 +8034,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8281,7 +8302,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8543,7 +8564,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -8818,11 +8839,12 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9098,11 +9120,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9378,11 +9401,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9658,11 +9682,12 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -9938,11 +9963,12 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10218,11 +10244,12 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10498,11 +10525,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10778,11 +10806,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11185,7 +11214,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -11385,7 +11414,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11594,7 +11623,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11935,7 +11964,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_store: @@ -12108,11 +12137,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_store: @@ -12285,11 +12315,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_store: @@ -12448,7 +12479,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw: @@ -12614,7 +12645,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -12780,11 +12811,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_atomicrmw: @@ -12956,11 +12988,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -13134,11 +13167,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -13325,7 +13359,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -13524,11 +13558,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -13727,11 +13762,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -13957,7 +13993,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: @@ -14189,7 +14225,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -14421,11 +14457,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: @@ -14663,11 +14700,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -14907,11 +14945,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -15145,7 +15184,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -15379,7 +15418,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -15619,11 +15658,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -15863,11 +15903,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -16107,11 +16148,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -16351,11 +16393,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -16595,11 +16638,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -16839,11 +16883,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -17083,11 +17128,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -17327,11 +17373,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: s_endpgm @@ -17582,7 +17629,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -17841,7 +17888,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18105,11 +18152,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-WGP-NEXT: s_endpgm @@ -18374,11 +18422,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18646,11 +18695,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -18912,7 +18962,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19174,7 +19224,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19442,11 +19492,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19714,11 +19765,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -19986,11 +20038,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20258,11 +20311,12 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20530,11 +20584,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -20802,11 +20857,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21074,11 +21130,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -21346,11 +21403,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 -; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index ff1538b146d20b..02cd97c9fe82a7 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -1140,6 +1140,7 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1298,6 +1299,7 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1752,6 +1754,7 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1923,6 +1926,7 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2097,6 +2101,7 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2486,6 +2491,7 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2692,6 +2698,7 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -3270,6 +3277,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -3481,6 +3489,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -3695,6 +3704,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4301,6 +4311,7 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4515,6 +4526,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4729,6 +4741,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4943,6 +4956,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5157,6 +5171,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5371,6 +5386,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5585,6 +5601,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5799,6 +5816,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -6480,6 +6498,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -6723,6 +6742,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -6967,6 +6987,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -7663,6 +7684,7 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -7907,6 +7929,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8151,6 +8174,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8395,6 +8419,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8639,6 +8664,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8883,6 +8909,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -9127,6 +9154,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -9371,6 +9399,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index 6d52a4d69b18c4..1c4c8d41b18f9d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -1140,6 +1140,7 @@ define amdgpu_kernel void @local_system_release_store( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1298,6 +1299,7 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1752,6 +1754,7 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1923,6 +1926,7 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2097,6 +2101,7 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2486,6 +2491,7 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2692,6 +2698,7 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -3270,6 +3277,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -3481,6 +3489,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -3695,6 +3704,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4301,6 +4311,7 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4515,6 +4526,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4729,6 +4741,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4943,6 +4956,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5157,6 +5171,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5371,6 +5386,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5585,6 +5601,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5799,6 +5816,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -6480,6 +6498,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -6723,6 +6742,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -6967,6 +6987,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -7663,6 +7684,7 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -7907,6 +7929,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8151,6 +8174,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8395,6 +8419,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8639,6 +8664,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8883,6 +8909,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -9127,6 +9154,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -9371,6 +9399,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index 2a15a0d0727fd6..a52dd9b3401696 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -834,6 +834,7 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index 0f56342c825b0b..c2429632285378 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -1140,6 +1140,7 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1298,6 +1299,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1752,6 +1754,7 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -1923,6 +1926,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2097,6 +2101,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2486,6 +2491,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -2692,6 +2698,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -3270,6 +3277,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -3481,6 +3489,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -3695,6 +3704,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4301,6 +4311,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4515,6 +4526,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4729,6 +4741,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -4943,6 +4956,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5157,6 +5171,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5371,6 +5386,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5585,6 +5601,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -5799,6 +5816,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -6480,6 +6498,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -6723,6 +6742,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -6967,6 +6987,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -7663,6 +7684,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -7907,6 +7929,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8151,6 +8174,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8395,6 +8419,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8639,6 +8664,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -8883,6 +8909,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -9127,6 +9154,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -9371,6 +9399,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir index cfd1bd4d13ce71..ab6d207cd96686 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx90a %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx90a %s -run-pass twoaddressinstruction -verify-machineinstrs -o - -early-live-intervals | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a %s --passes=two-address-instruction -o - | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: test_fmamk_reg_imm_f64 # GCN: V_FMA_F64_e64 0, killed %0, 0, %2, 0, killed %1, 0, 0, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir index 379133f9417c89..daac34bab0fd0e 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir @@ -1,7 +1,7 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - -early-live-intervals | FileCheck --check-prefixes=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - -early-live-intervals | FileCheck --check-prefixes=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 %s --passes=two-address-instruction -o - | FileCheck --check-prefixes=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s --passes=two-address-instruction -o - | FileCheck --check-prefixes=GCN %s # GCN-LABEL: name: test_fmamk_reg_imm_f32 # GCN: %2:vgpr_32 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir index f5d147d83404b7..7062878a846097 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx900 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - -early-live-intervals | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 %s --passes=two-address-instruction -o - | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: test_madmk_reg_imm_f32 # GCN: V_MADMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir index 8aaba0060723bc..5fbb149548909d 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - -early-live-intervals | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s --passes=two-address-instruction -o - | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: test_v_wmma_f32_16x16x16_f16_twoaddr_w32 # GCN: early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/unknown-processor.ll b/llvm/test/CodeGen/AMDGPU/unknown-processor.ll index 683ba98e52cf15..9cfba8b2e5c04a 100644 --- a/llvm/test/CodeGen/AMDGPU/unknown-processor.ll +++ b/llvm/test/CodeGen/AMDGPU/unknown-processor.ll @@ -1,4 +1,4 @@ -; RUN: not llc -mtriple=amdgcn-- -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-- -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s ; RUN: llc -mtriple=r600-- -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s target datalayout = "A5" diff --git a/llvm/test/CodeGen/AMDGPU/verifier-sdwa-cvt.mir b/llvm/test/CodeGen/AMDGPU/verifier-sdwa-cvt.mir index 2066637a34af0e..a139a2e3389840 100644 --- a/llvm/test/CodeGen/AMDGPU/verifier-sdwa-cvt.mir +++ b/llvm/test/CodeGen/AMDGPU/verifier-sdwa-cvt.mir @@ -1,4 +1,5 @@ # RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx940 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" %s +# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx940 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" %s # CHECK: *** Bad machine code: sext, abs and neg are not allowed on this instruction *** # CHECK: $vgpr0 = V_CVT_F32_FP8_sdwa 1, $vgpr0, 0, 0, 4, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir b/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir index 790ecd21e21f7c..81d17a8fd0f90f 100644 --- a/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir +++ b/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir @@ -1,6 +1,9 @@ # RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX9-ERR %s # RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s # RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s +# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX9-ERR %s +# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s +# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s # GFX9-ERR: *** Bad machine code: VOP* instruction violates constant bus restriction *** # GFX9-ERR: $vgpr0 = V_CNDMASK_B32_e64 0, $sgpr0, 0, -1, killed $sgpr0_sgpr1, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/verify-ds-gws-align.mir b/llvm/test/CodeGen/AMDGPU/verify-ds-gws-align.mir index bdb273cba79c66..e5521ee3efb4e4 100644 --- a/llvm/test/CodeGen/AMDGPU/verify-ds-gws-align.mir +++ b/llvm/test/CodeGen/AMDGPU/verify-ds-gws-align.mir @@ -1,4 +1,5 @@ # RUN: not --crash llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -run-pass=machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX90A-ERR %s +# RUN: not --crash llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX90A-ERR %s # GFX90A-ERR: *** Bad machine code: Subtarget requires even aligned vector registers for DS_GWS instructions *** # GFX90A-ERR: DS_GWS_INIT killed %0.sub1:areg_128_align2, 0, implicit $m0, implicit $exec :: (store (s32) into custom "GWSResource") diff --git a/llvm/test/CodeGen/AMDGPU/verify-duplicate-literal.mir b/llvm/test/CodeGen/AMDGPU/verify-duplicate-literal.mir index da389743daf3ac..a13d601f79fd4a 100644 --- a/llvm/test/CodeGen/AMDGPU/verify-duplicate-literal.mir +++ b/llvm/test/CodeGen/AMDGPU/verify-duplicate-literal.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass machineverifier -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 --passes='machine-function(verify)' -o - %s | FileCheck %s # Two uses of the same literal only count as one use of the constant bus. diff --git a/llvm/test/CodeGen/AMDGPU/verify-gfx90a-aligned-vgprs.mir b/llvm/test/CodeGen/AMDGPU/verify-gfx90a-aligned-vgprs.mir index 12ed2895012b61..c4c1dcf242a5a3 100644 --- a/llvm/test/CodeGen/AMDGPU/verify-gfx90a-aligned-vgprs.mir +++ b/llvm/test/CodeGen/AMDGPU/verify-gfx90a-aligned-vgprs.mir @@ -1,4 +1,5 @@ # RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=machineverifier -o /dev/null %s 2>&1 | FileCheck %s +# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx90a --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck %s # Implicit uses are OK. --- diff --git a/llvm/test/CodeGen/AMDGPU/verify-image-vaddr-align.mir b/llvm/test/CodeGen/AMDGPU/verify-image-vaddr-align.mir index ca6fa25d8c919e..dcdc105724a2e2 100644 --- a/llvm/test/CodeGen/AMDGPU/verify-image-vaddr-align.mir +++ b/llvm/test/CodeGen/AMDGPU/verify-image-vaddr-align.mir @@ -1,4 +1,5 @@ # RUN: not --crash llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -run-pass=machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX90A-ERR %s +# RUN: not --crash llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX90A-ERR %s # GFX90A-ERR: *** Bad machine code: Subtarget requires even aligned vector registers for vaddr operand of image instructions *** # GFX90A-ERR: %4:vgpr_32 = IMAGE_SAMPLE_V1_V1_gfx90a %0.sub1:vreg_128_align2 diff --git a/llvm/test/CodeGen/AMDGPU/verify-image.mir b/llvm/test/CodeGen/AMDGPU/verify-image.mir index 5bb7303968a7eb..98eaec600aeb13 100644 --- a/llvm/test/CodeGen/AMDGPU/verify-image.mir +++ b/llvm/test/CodeGen/AMDGPU/verify-image.mir @@ -1,4 +1,5 @@ # RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s +# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s --- name: image_verify diff --git a/llvm/test/CodeGen/AMDGPU/verify-scalar-store.mir b/llvm/test/CodeGen/AMDGPU/verify-scalar-store.mir index 3184bbd8bd2b08..6fc399b1da3421 100644 --- a/llvm/test/CodeGen/AMDGPU/verify-scalar-store.mir +++ b/llvm/test/CodeGen/AMDGPU/verify-scalar-store.mir @@ -1,5 +1,7 @@ # RUN: not --crash llc -mtriple=amdgcn -mcpu=tonga -run-pass=machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX8-ERR %s # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=machineverifier -o - %s 2>&1 | FileCheck -check-prefix=GFX9 %s +# RUN: not --crash llc -mtriple=amdgcn -mcpu=tonga --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX8-ERR %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 --passes='machine-function(verify)' -o - %s 2>&1 | FileCheck -check-prefix=GFX9 %s # GFX8-ERR: *** Bad machine code: scalar stores must use m0 as offset register *** # GFX9: S_STORE_DWORD_SGPR diff --git a/llvm/test/CodeGen/AMDGPU/verify-sop.mir b/llvm/test/CodeGen/AMDGPU/verify-sop.mir index 149b6484290d81..e7fc19e9c9cc43 100644 --- a/llvm/test/CodeGen/AMDGPU/verify-sop.mir +++ b/llvm/test/CodeGen/AMDGPU/verify-sop.mir @@ -1,4 +1,5 @@ # RUN: not --crash llc -mtriple=amdgcn -run-pass machineverifier %s -o - 2>&1 | FileCheck %s +# RUN: not --crash llc -mtriple=amdgcn --passes='machine-function(verify)' %s -o - 2>&1 | FileCheck %s # CHECK: *** Bad machine code: SOP2/SOPC instruction requires too many immediate constants # CHECK: - instruction: %0:sreg_32_xm0 = S_ADD_I32 diff --git a/llvm/test/CodeGen/AMDGPU/verify-vimage-vsample.mir b/llvm/test/CodeGen/AMDGPU/verify-vimage-vsample.mir index 12caf08338f44d..845a17df4e8b6d 100644 --- a/llvm/test/CodeGen/AMDGPU/verify-vimage-vsample.mir +++ b/llvm/test/CodeGen/AMDGPU/verify-vimage-vsample.mir @@ -1,4 +1,5 @@ # RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s +# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s --- name: vimage_vsample_verify diff --git a/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir b/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir index 6614d8f9c4b09c..6c55183bb52879 100644 --- a/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir +++ b/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir @@ -1,4 +1,5 @@ # RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s +# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s # GFX12-ERR: *** Bad machine code: VOP* instruction violates constant bus restriction *** # GFX12-ERR: $vgpr2, $vgpr3 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx12 $sgpr0, $vgpr0, $sgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $vcc_lo, implicit $vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/verify-vopd.mir b/llvm/test/CodeGen/AMDGPU/verify-vopd.mir index 374f8989571937..dc7d4afa857410 100644 --- a/llvm/test/CodeGen/AMDGPU/verify-vopd.mir +++ b/llvm/test/CodeGen/AMDGPU/verify-vopd.mir @@ -1,4 +1,5 @@ # RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s +# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s # GFX11-ERR: *** Bad machine code: VOP* instruction violates constant bus restriction *** # GFX11-ERR: $vgpr2, $vgpr3 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx11 $sgpr0, $vgpr0, $sgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $vcc_lo, implicit $vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index 6410df7f69e2ac..7301b341cbc71d 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -866,90 +866,5 @@ bb.3: ret void } -; This should not cause Assertion `getType() == V->getType() && "All operands to PHI node must be the same type as the PHI node -; Note: whether or not the assertion fires depends on the iteration ortder of PhiNodes in AMDGPULateCodeGenPrepare, which -; is non-deterministic due to iterators over a set of pointers. - - -define amdgpu_kernel void @MissingInc_PhiChain(i1 %cmp, <16 x i8> %input) { -; GFX906-LABEL: MissingInc_PhiChain: -; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX906-NEXT: s_mov_b32 s10, 1 -; GFX906-NEXT: v_mov_b32_e32 v4, 1 -; GFX906-NEXT: s_mov_b32 s11, 1 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_bitcmp1_b32 s0, 0 -; GFX906-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX906-NEXT: s_xor_b64 s[0:1], s[2:3], -1 -; GFX906-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX906-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GFX906-NEXT: s_branch .LBB14_2 -; GFX906-NEXT: .LBB14_1: ; %bb.5 -; GFX906-NEXT: ; in Loop: Header=BB14_2 Depth=1 -; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX906-NEXT: s_mov_b32 s10, 0 -; GFX906-NEXT: s_mov_b32 s11, 0 -; GFX906-NEXT: .LBB14_2: ; %bb.1 -; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX906-NEXT: s_and_b64 vcc, exec, s[0:1] -; GFX906-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX906-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX906-NEXT: s_cbranch_vccnz .LBB14_4 -; GFX906-NEXT: ; %bb.3: ; %bb.2 -; GFX906-NEXT: ; in Loop: Header=BB14_2 Depth=1 -; GFX906-NEXT: v_lshlrev_b16_e64 v0, 8, s11 -; GFX906-NEXT: v_or_b32_sdwa v0, s10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v4 -; GFX906-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX906-NEXT: s_mov_b64 s[8:9], -1 -; GFX906-NEXT: .LBB14_4: ; %Flow -; GFX906-NEXT: ; in Loop: Header=BB14_2 Depth=1 -; GFX906-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GFX906-NEXT: s_cbranch_vccnz .LBB14_7 -; GFX906-NEXT: ; %bb.5: ; %bb.3 -; GFX906-NEXT: ; in Loop: Header=BB14_2 Depth=1 -; GFX906-NEXT: s_and_b64 vcc, exec, s[0:1] -; GFX906-NEXT: s_cbranch_vccnz .LBB14_1 -; GFX906-NEXT: ; %bb.6: ; %bb.4 -; GFX906-NEXT: ; in Loop: Header=BB14_2 Depth=1 -; GFX906-NEXT: v_mov_b32_e32 v0, s4 -; GFX906-NEXT: v_mov_b32_e32 v1, s5 -; GFX906-NEXT: v_mov_b32_e32 v2, s6 -; GFX906-NEXT: v_mov_b32_e32 v3, s7 -; GFX906-NEXT: s_branch .LBB14_1 -; GFX906-NEXT: .LBB14_7: ; in Loop: Header=BB14_2 Depth=1 -; GFX906-NEXT: ; implicit-def: $vgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr10 -; GFX906-NEXT: ; implicit-def: $sgpr11 -; GFX906-NEXT: s_cbranch_execz .LBB14_2 -; GFX906-NEXT: ; %bb.8: ; %DummyReturnBlock -; GFX906-NEXT: s_endpgm -entry: - br label %bb.1 - -bb.1: ; preds = %bb.5, %entry - %phi1 = phi <16 x i8> [ , %entry ], [ %shuffle, %bb.5 ] - br i1 %cmp, label %bb.3, label %bb.2 - -bb.2: ; preds = %bb.1 - %insert = insertelement <16 x i8> %phi1, i8 0, i64 0 - br label %bb.3 - -bb.3: ; preds = %bb.2, %bb.1 - %phi2 = phi <16 x i8> [ %insert, %bb.2 ], [ %phi1, %bb.1 ] - br i1 %cmp, label %bb.5, label %bb.4 - -bb.4: ; preds = %bb.3 - br label %bb.5 - -bb.5: ; preds = %bb.4, %bb.3 - %phi3 = phi <16 x i8> [ %input, %bb.4 ], [ %phi2, %bb.3 ] - %shuffle = shufflevector <16 x i8> %phi3, <16 x i8> zeroinitializer, <16 x i32> - br label %bb.1 -} - - declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/ARM/scmp.ll b/llvm/test/CodeGen/ARM/scmp.ll new file mode 100644 index 00000000000000..6e493c993751c9 --- /dev/null +++ b/llvm/test/CodeGen/ARM/scmp.ll @@ -0,0 +1,143 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=armv7-unknown-eabi %s -o - | FileCheck %s + +define i8 @scmp_8_8(i8 signext %x, i8 signext %y) nounwind { +; CHECK-LABEL: scmp_8_8: +; CHECK: @ %bb.0: +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: movwlt r0, #1 +; CHECK-NEXT: movwgt r2, #1 +; CHECK-NEXT: sub r0, r2, r0 +; CHECK-NEXT: bx lr + %1 = call i8 @llvm.scmp(i8 %x, i8 %y) + ret i8 %1 +} + +define i8 @scmp_8_16(i16 signext %x, i16 signext %y) nounwind { +; CHECK-LABEL: scmp_8_16: +; CHECK: @ %bb.0: +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: movwlt r0, #1 +; CHECK-NEXT: movwgt r2, #1 +; CHECK-NEXT: sub r0, r2, r0 +; CHECK-NEXT: bx lr + %1 = call i8 @llvm.scmp(i16 %x, i16 %y) + ret i8 %1 +} + +define i8 @scmp_8_32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: scmp_8_32: +; CHECK: @ %bb.0: +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: movwlt r0, #1 +; CHECK-NEXT: movwgt r2, #1 +; CHECK-NEXT: sub r0, r2, r0 +; CHECK-NEXT: bx lr + %1 = call i8 @llvm.scmp(i32 %x, i32 %y) + ret i8 %1 +} + +define i8 @scmp_8_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: scmp_8_64: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: subs lr, r0, r2 +; CHECK-NEXT: mov r12, #0 +; CHECK-NEXT: sbcs lr, r1, r3 +; CHECK-NEXT: mov lr, #0 +; CHECK-NEXT: movwlt lr, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs r0, r3, r1 +; CHECK-NEXT: movwlt r12, #1 +; CHECK-NEXT: sub r0, r12, lr +; CHECK-NEXT: pop {r11, pc} + %1 = call i8 @llvm.scmp(i64 %x, i64 %y) + ret i8 %1 +} + +define i8 @scmp_8_128(i128 %x, i128 %y) nounwind { +; CHECK-LABEL: scmp_8_128: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr} +; CHECK-NEXT: ldr r4, [sp, #24] +; CHECK-NEXT: mov r5, #0 +; CHECK-NEXT: ldr r6, [sp, #28] +; CHECK-NEXT: subs r7, r0, r4 +; CHECK-NEXT: ldr r12, [sp, #32] +; CHECK-NEXT: sbcs r7, r1, r6 +; CHECK-NEXT: ldr lr, [sp, #36] +; CHECK-NEXT: sbcs r7, r2, r12 +; CHECK-NEXT: sbcs r7, r3, lr +; CHECK-NEXT: mov r7, #0 +; CHECK-NEXT: movwlt r7, #1 +; CHECK-NEXT: subs r0, r4, r0 +; CHECK-NEXT: sbcs r0, r6, r1 +; CHECK-NEXT: sbcs r0, r12, r2 +; CHECK-NEXT: sbcs r0, lr, r3 +; CHECK-NEXT: movwlt r5, #1 +; CHECK-NEXT: sub r0, r5, r7 +; CHECK-NEXT: pop {r4, r5, r6, r7, r11, pc} + %1 = call i8 @llvm.scmp(i128 %x, i128 %y) + ret i8 %1 +} + +define i32 @scmp_32_32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: scmp_32_32: +; CHECK: @ %bb.0: +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: movwlt r0, #1 +; CHECK-NEXT: movwgt r2, #1 +; CHECK-NEXT: sub r0, r2, r0 +; CHECK-NEXT: bx lr + %1 = call i32 @llvm.scmp(i32 %x, i32 %y) + ret i32 %1 +} + +define i32 @scmp_32_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: scmp_32_64: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: subs lr, r0, r2 +; CHECK-NEXT: mov r12, #0 +; CHECK-NEXT: sbcs lr, r1, r3 +; CHECK-NEXT: mov lr, #0 +; CHECK-NEXT: movwlt lr, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs r0, r3, r1 +; CHECK-NEXT: movwlt r12, #1 +; CHECK-NEXT: sub r0, r12, lr +; CHECK-NEXT: pop {r11, pc} + %1 = call i32 @llvm.scmp(i64 %x, i64 %y) + ret i32 %1 +} + +define i64 @scmp_64_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: scmp_64_64: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: subs lr, r0, r2 +; CHECK-NEXT: mov r12, #0 +; CHECK-NEXT: sbcs lr, r1, r3 +; CHECK-NEXT: mov lr, #0 +; CHECK-NEXT: movwlt lr, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs r0, r3, r1 +; CHECK-NEXT: movwlt r12, #1 +; CHECK-NEXT: sub r0, r12, lr +; CHECK-NEXT: asr r1, r0, #31 +; CHECK-NEXT: pop {r11, pc} + %1 = call i64 @llvm.scmp(i64 %x, i64 %y) + ret i64 %1 +} diff --git a/llvm/test/CodeGen/ARM/ucmp.ll b/llvm/test/CodeGen/ARM/ucmp.ll new file mode 100644 index 00000000000000..ad4af534ee8fea --- /dev/null +++ b/llvm/test/CodeGen/ARM/ucmp.ll @@ -0,0 +1,143 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=armv7-unknown-eabi %s -o - | FileCheck %s + +define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind { +; CHECK-LABEL: ucmp_8_8: +; CHECK: @ %bb.0: +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: movwlo r0, #1 +; CHECK-NEXT: movwhi r2, #1 +; CHECK-NEXT: sub r0, r2, r0 +; CHECK-NEXT: bx lr + %1 = call i8 @llvm.ucmp(i8 %x, i8 %y) + ret i8 %1 +} + +define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind { +; CHECK-LABEL: ucmp_8_16: +; CHECK: @ %bb.0: +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: movwlo r0, #1 +; CHECK-NEXT: movwhi r2, #1 +; CHECK-NEXT: sub r0, r2, r0 +; CHECK-NEXT: bx lr + %1 = call i8 @llvm.ucmp(i16 %x, i16 %y) + ret i8 %1 +} + +define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: ucmp_8_32: +; CHECK: @ %bb.0: +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: movwlo r0, #1 +; CHECK-NEXT: movwhi r2, #1 +; CHECK-NEXT: sub r0, r2, r0 +; CHECK-NEXT: bx lr + %1 = call i8 @llvm.ucmp(i32 %x, i32 %y) + ret i8 %1 +} + +define i8 @ucmp_8_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ucmp_8_64: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: subs lr, r0, r2 +; CHECK-NEXT: mov r12, #0 +; CHECK-NEXT: sbcs lr, r1, r3 +; CHECK-NEXT: mov lr, #0 +; CHECK-NEXT: movwlo lr, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs r0, r3, r1 +; CHECK-NEXT: movwlo r12, #1 +; CHECK-NEXT: sub r0, r12, lr +; CHECK-NEXT: pop {r11, pc} + %1 = call i8 @llvm.ucmp(i64 %x, i64 %y) + ret i8 %1 +} + +define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind { +; CHECK-LABEL: ucmp_8_128: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr} +; CHECK-NEXT: ldr r4, [sp, #24] +; CHECK-NEXT: mov r5, #0 +; CHECK-NEXT: ldr r6, [sp, #28] +; CHECK-NEXT: subs r7, r0, r4 +; CHECK-NEXT: ldr r12, [sp, #32] +; CHECK-NEXT: sbcs r7, r1, r6 +; CHECK-NEXT: ldr lr, [sp, #36] +; CHECK-NEXT: sbcs r7, r2, r12 +; CHECK-NEXT: sbcs r7, r3, lr +; CHECK-NEXT: mov r7, #0 +; CHECK-NEXT: movwlo r7, #1 +; CHECK-NEXT: subs r0, r4, r0 +; CHECK-NEXT: sbcs r0, r6, r1 +; CHECK-NEXT: sbcs r0, r12, r2 +; CHECK-NEXT: sbcs r0, lr, r3 +; CHECK-NEXT: movwlo r5, #1 +; CHECK-NEXT: sub r0, r5, r7 +; CHECK-NEXT: pop {r4, r5, r6, r7, r11, pc} + %1 = call i8 @llvm.ucmp(i128 %x, i128 %y) + ret i8 %1 +} + +define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: ucmp_32_32: +; CHECK: @ %bb.0: +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: movwlo r0, #1 +; CHECK-NEXT: movwhi r2, #1 +; CHECK-NEXT: sub r0, r2, r0 +; CHECK-NEXT: bx lr + %1 = call i32 @llvm.ucmp(i32 %x, i32 %y) + ret i32 %1 +} + +define i32 @ucmp_32_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ucmp_32_64: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: subs lr, r0, r2 +; CHECK-NEXT: mov r12, #0 +; CHECK-NEXT: sbcs lr, r1, r3 +; CHECK-NEXT: mov lr, #0 +; CHECK-NEXT: movwlo lr, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs r0, r3, r1 +; CHECK-NEXT: movwlo r12, #1 +; CHECK-NEXT: sub r0, r12, lr +; CHECK-NEXT: pop {r11, pc} + %1 = call i32 @llvm.ucmp(i64 %x, i64 %y) + ret i32 %1 +} + +define i64 @ucmp_64_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ucmp_64_64: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: subs lr, r0, r2 +; CHECK-NEXT: mov r12, #0 +; CHECK-NEXT: sbcs lr, r1, r3 +; CHECK-NEXT: mov lr, #0 +; CHECK-NEXT: movwlo lr, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs r0, r3, r1 +; CHECK-NEXT: movwlo r12, #1 +; CHECK-NEXT: sub r0, r12, lr +; CHECK-NEXT: asr r1, r0, #31 +; CHECK-NEXT: pop {r11, pc} + %1 = call i64 @llvm.ucmp(i64 %x, i64 %y) + ret i64 %1 +} diff --git a/llvm/test/CodeGen/Hexagon/two-addr-tied-subregs.mir b/llvm/test/CodeGen/Hexagon/two-addr-tied-subregs.mir index 87e117c461b64d..c533a5a167ab2f 100644 --- a/llvm/test/CodeGen/Hexagon/two-addr-tied-subregs.mir +++ b/llvm/test/CodeGen/Hexagon/two-addr-tied-subregs.mir @@ -1,4 +1,5 @@ # RUN: llc -march hexagon -run-pass livevars -run-pass twoaddressinstruction -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -march hexagon --passes='require,two-address-instruction' -o - %s | FileCheck %s ############################################################################### diff --git a/llvm/test/CodeGen/LoongArch/scmp.ll b/llvm/test/CodeGen/LoongArch/scmp.ll new file mode 100644 index 00000000000000..69a92968173d24 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/scmp.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch64 -mattr=+d --verify-machineinstrs < %s | FileCheck %s + +define i8 @scmp.8.8(i8 signext %x, i8 signext %y) nounwind { +; CHECK-LABEL: scmp.8.8: +; CHECK: # %bb.0: +; CHECK-NEXT: slt $a2, $a0, $a1 +; CHECK-NEXT: slt $a0, $a1, $a0 +; CHECK-NEXT: sub.d $a0, $a0, $a2 +; CHECK-NEXT: ret + %1 = call i8 @llvm.scmp(i8 %x, i8 %y) + ret i8 %1 +} + +define i8 @scmp.8.16(i16 signext %x, i16 signext %y) nounwind { +; CHECK-LABEL: scmp.8.16: +; CHECK: # %bb.0: +; CHECK-NEXT: slt $a2, $a0, $a1 +; CHECK-NEXT: slt $a0, $a1, $a0 +; CHECK-NEXT: sub.d $a0, $a0, $a2 +; CHECK-NEXT: ret + %1 = call i8 @llvm.scmp(i16 %x, i16 %y) + ret i8 %1 +} + +define i8 @scmp.8.32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: scmp.8.32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.w $a1, $a1, 0 +; CHECK-NEXT: addi.w $a0, $a0, 0 +; CHECK-NEXT: slt $a2, $a0, $a1 +; CHECK-NEXT: slt $a0, $a1, $a0 +; CHECK-NEXT: sub.d $a0, $a0, $a2 +; CHECK-NEXT: ret + %1 = call i8 @llvm.scmp(i32 %x, i32 %y) + ret i8 %1 +} + +define i8 @scmp.8.64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: scmp.8.64: +; CHECK: # %bb.0: +; CHECK-NEXT: slt $a2, $a0, $a1 +; CHECK-NEXT: slt $a0, $a1, $a0 +; CHECK-NEXT: sub.d $a0, $a0, $a2 +; CHECK-NEXT: ret + %1 = call i8 @llvm.scmp(i64 %x, i64 %y) + ret i8 %1 +} + +define i8 @scmp.8.128(i128 %x, i128 %y) nounwind { +; CHECK-LABEL: scmp.8.128: +; CHECK: # %bb.0: +; CHECK-NEXT: slt $a4, $a1, $a3 +; CHECK-NEXT: xor $a5, $a1, $a3 +; CHECK-NEXT: sltui $a5, $a5, 1 +; CHECK-NEXT: masknez $a4, $a4, $a5 +; CHECK-NEXT: sltu $a6, $a0, $a2 +; CHECK-NEXT: maskeqz $a6, $a6, $a5 +; CHECK-NEXT: or $a4, $a6, $a4 +; CHECK-NEXT: slt $a1, $a3, $a1 +; CHECK-NEXT: masknez $a1, $a1, $a5 +; CHECK-NEXT: sltu $a0, $a2, $a0 +; CHECK-NEXT: maskeqz $a0, $a0, $a5 +; CHECK-NEXT: or $a0, $a0, $a1 +; CHECK-NEXT: sub.d $a0, $a0, $a4 +; CHECK-NEXT: ret + %1 = call i8 @llvm.scmp(i128 %x, i128 %y) + ret i8 %1 +} + +define i32 @scmp.32.32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: scmp.32.32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.w $a1, $a1, 0 +; CHECK-NEXT: addi.w $a0, $a0, 0 +; CHECK-NEXT: slt $a2, $a0, $a1 +; CHECK-NEXT: slt $a0, $a1, $a0 +; CHECK-NEXT: sub.d $a0, $a0, $a2 +; CHECK-NEXT: ret + %1 = call i32 @llvm.scmp(i32 %x, i32 %y) + ret i32 %1 +} + +define i32 @scmp.32.64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: scmp.32.64: +; CHECK: # %bb.0: +; CHECK-NEXT: slt $a2, $a0, $a1 +; CHECK-NEXT: slt $a0, $a1, $a0 +; CHECK-NEXT: sub.d $a0, $a0, $a2 +; CHECK-NEXT: ret + %1 = call i32 @llvm.scmp(i64 %x, i64 %y) + ret i32 %1 +} + +define i64 @scmp.64.64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: scmp.64.64: +; CHECK: # %bb.0: +; CHECK-NEXT: slt $a2, $a0, $a1 +; CHECK-NEXT: slt $a0, $a1, $a0 +; CHECK-NEXT: sub.d $a0, $a0, $a2 +; CHECK-NEXT: ret + %1 = call i64 @llvm.scmp(i64 %x, i64 %y) + ret i64 %1 +} diff --git a/llvm/test/CodeGen/LoongArch/typepromotion-overflow.ll b/llvm/test/CodeGen/LoongArch/typepromotion-overflow.ll new file mode 100644 index 00000000000000..68ad655130f5a8 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/typepromotion-overflow.ll @@ -0,0 +1,642 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=loongarch32 %s -o - | FileCheck %s --check-prefix=LA32 +; RUN: llc -mtriple=loongarch64 %s -o - | FileCheck %s --check-prefix=LA64 + +define zeroext i16 @overflow_add(i16 zeroext %a, i16 zeroext %b) { +; LA32-LABEL: overflow_add: +; LA32: # %bb.0: +; LA32-NEXT: add.w $a0, $a1, $a0 +; LA32-NEXT: ori $a0, $a0, 1 +; LA32-NEXT: bstrpick.w $a0, $a0, 15, 0 +; LA32-NEXT: ori $a1, $zero, 1024 +; LA32-NEXT: sltu $a0, $a1, $a0 +; LA32-NEXT: ori $a1, $zero, 5 +; LA32-NEXT: masknez $a1, $a1, $a0 +; LA32-NEXT: ori $a2, $zero, 2 +; LA32-NEXT: maskeqz $a0, $a2, $a0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: overflow_add: +; LA64: # %bb.0: +; LA64-NEXT: add.d $a0, $a1, $a0 +; LA64-NEXT: ori $a0, $a0, 1 +; LA64-NEXT: bstrpick.d $a0, $a0, 15, 0 +; LA64-NEXT: ori $a1, $zero, 1024 +; LA64-NEXT: sltu $a0, $a1, $a0 +; LA64-NEXT: ori $a1, $zero, 5 +; LA64-NEXT: masknez $a1, $a1, $a0 +; LA64-NEXT: ori $a2, $zero, 2 +; LA64-NEXT: maskeqz $a0, $a2, $a0 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret + %add = add i16 %b, %a + %or = or i16 %add, 1 + %cmp = icmp ugt i16 %or, 1024 + %res = select i1 %cmp, i16 2, i16 5 + ret i16 %res +} + +define zeroext i16 @overflow_sub(i16 zeroext %a, i16 zeroext %b) { +; LA32-LABEL: overflow_sub: +; LA32: # %bb.0: +; LA32-NEXT: sub.w $a0, $a0, $a1 +; LA32-NEXT: ori $a0, $a0, 1 +; LA32-NEXT: bstrpick.w $a0, $a0, 15, 0 +; LA32-NEXT: ori $a1, $zero, 1024 +; LA32-NEXT: sltu $a0, $a1, $a0 +; LA32-NEXT: ori $a1, $zero, 5 +; LA32-NEXT: masknez $a1, $a1, $a0 +; LA32-NEXT: ori $a2, $zero, 2 +; LA32-NEXT: maskeqz $a0, $a2, $a0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: overflow_sub: +; LA64: # %bb.0: +; LA64-NEXT: sub.d $a0, $a0, $a1 +; LA64-NEXT: ori $a0, $a0, 1 +; LA64-NEXT: bstrpick.d $a0, $a0, 15, 0 +; LA64-NEXT: ori $a1, $zero, 1024 +; LA64-NEXT: sltu $a0, $a1, $a0 +; LA64-NEXT: ori $a1, $zero, 5 +; LA64-NEXT: masknez $a1, $a1, $a0 +; LA64-NEXT: ori $a2, $zero, 2 +; LA64-NEXT: maskeqz $a0, $a2, $a0 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret + %add = sub i16 %a, %b + %or = or i16 %add, 1 + %cmp = icmp ugt i16 %or, 1024 + %res = select i1 %cmp, i16 2, i16 5 + ret i16 %res +} + +define zeroext i16 @overflow_mul(i16 zeroext %a, i16 zeroext %b) { +; LA32-LABEL: overflow_mul: +; LA32: # %bb.0: +; LA32-NEXT: mul.w $a0, $a1, $a0 +; LA32-NEXT: ori $a0, $a0, 1 +; LA32-NEXT: bstrpick.w $a0, $a0, 15, 0 +; LA32-NEXT: ori $a1, $zero, 1024 +; LA32-NEXT: sltu $a0, $a1, $a0 +; LA32-NEXT: ori $a1, $zero, 5 +; LA32-NEXT: masknez $a1, $a1, $a0 +; LA32-NEXT: ori $a2, $zero, 2 +; LA32-NEXT: maskeqz $a0, $a2, $a0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: overflow_mul: +; LA64: # %bb.0: +; LA64-NEXT: mul.d $a0, $a1, $a0 +; LA64-NEXT: ori $a0, $a0, 1 +; LA64-NEXT: bstrpick.d $a0, $a0, 15, 0 +; LA64-NEXT: ori $a1, $zero, 1024 +; LA64-NEXT: sltu $a0, $a1, $a0 +; LA64-NEXT: ori $a1, $zero, 5 +; LA64-NEXT: masknez $a1, $a1, $a0 +; LA64-NEXT: ori $a2, $zero, 2 +; LA64-NEXT: maskeqz $a0, $a2, $a0 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret + %add = mul i16 %b, %a + %or = or i16 %add, 1 + %cmp = icmp ugt i16 %or, 1024 + %res = select i1 %cmp, i16 2, i16 5 + ret i16 %res +} + +define zeroext i16 @overflow_shl(i16 zeroext %a, i16 zeroext %b) { +; LA32-LABEL: overflow_shl: +; LA32: # %bb.0: +; LA32-NEXT: sll.w $a0, $a0, $a1 +; LA32-NEXT: ori $a0, $a0, 1 +; LA32-NEXT: bstrpick.w $a0, $a0, 15, 0 +; LA32-NEXT: ori $a1, $zero, 1024 +; LA32-NEXT: sltu $a0, $a1, $a0 +; LA32-NEXT: ori $a1, $zero, 5 +; LA32-NEXT: masknez $a1, $a1, $a0 +; LA32-NEXT: ori $a2, $zero, 2 +; LA32-NEXT: maskeqz $a0, $a2, $a0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: overflow_shl: +; LA64: # %bb.0: +; LA64-NEXT: sll.d $a0, $a0, $a1 +; LA64-NEXT: ori $a0, $a0, 1 +; LA64-NEXT: bstrpick.d $a0, $a0, 15, 0 +; LA64-NEXT: ori $a1, $zero, 1024 +; LA64-NEXT: sltu $a0, $a1, $a0 +; LA64-NEXT: ori $a1, $zero, 5 +; LA64-NEXT: masknez $a1, $a1, $a0 +; LA64-NEXT: ori $a2, $zero, 2 +; LA64-NEXT: maskeqz $a0, $a2, $a0 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret + %add = shl i16 %a, %b + %or = or i16 %add, 1 + %cmp = icmp ugt i16 %or, 1024 + %res = select i1 %cmp, i16 2, i16 5 + ret i16 %res +} + +define i32 @overflow_add_no_consts(i8 zeroext %a, i8 zeroext %b, i8 zeroext %limit) { +; LA32-LABEL: overflow_add_no_consts: +; LA32: # %bb.0: +; LA32-NEXT: add.w $a0, $a1, $a0 +; LA32-NEXT: andi $a0, $a0, 255 +; LA32-NEXT: sltu $a0, $a2, $a0 +; LA32-NEXT: ori $a1, $zero, 16 +; LA32-NEXT: masknez $a1, $a1, $a0 +; LA32-NEXT: ori $a2, $zero, 8 +; LA32-NEXT: maskeqz $a0, $a2, $a0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: overflow_add_no_consts: +; LA64: # %bb.0: +; LA64-NEXT: add.d $a0, $a1, $a0 +; LA64-NEXT: andi $a0, $a0, 255 +; LA64-NEXT: sltu $a0, $a2, $a0 +; LA64-NEXT: ori $a1, $zero, 16 +; LA64-NEXT: masknez $a1, $a1, $a0 +; LA64-NEXT: ori $a2, $zero, 8 +; LA64-NEXT: maskeqz $a0, $a2, $a0 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret + %add = add i8 %b, %a + %cmp = icmp ugt i8 %add, %limit + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +define i32 @overflow_add_const_limit(i8 zeroext %a, i8 zeroext %b) { +; LA32-LABEL: overflow_add_const_limit: +; LA32: # %bb.0: +; LA32-NEXT: add.w $a0, $a1, $a0 +; LA32-NEXT: andi $a0, $a0, 255 +; LA32-NEXT: ori $a1, $zero, 128 +; LA32-NEXT: sltu $a0, $a1, $a0 +; LA32-NEXT: ori $a1, $zero, 16 +; LA32-NEXT: masknez $a1, $a1, $a0 +; LA32-NEXT: ori $a2, $zero, 8 +; LA32-NEXT: maskeqz $a0, $a2, $a0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: overflow_add_const_limit: +; LA64: # %bb.0: +; LA64-NEXT: add.d $a0, $a1, $a0 +; LA64-NEXT: andi $a0, $a0, 255 +; LA64-NEXT: ori $a1, $zero, 128 +; LA64-NEXT: sltu $a0, $a1, $a0 +; LA64-NEXT: ori $a1, $zero, 16 +; LA64-NEXT: masknez $a1, $a1, $a0 +; LA64-NEXT: ori $a2, $zero, 8 +; LA64-NEXT: maskeqz $a0, $a2, $a0 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret + %add = add i8 %b, %a + %cmp = icmp ugt i8 %add, -128 + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +define i32 @overflow_add_positive_const_limit(i8 zeroext %a) { +; LA32-LABEL: overflow_add_positive_const_limit: +; LA32: # %bb.0: +; LA32-NEXT: ext.w.b $a0, $a0 +; LA32-NEXT: slti $a0, $a0, -1 +; LA32-NEXT: ori $a1, $zero, 16 +; LA32-NEXT: masknez $a1, $a1, $a0 +; LA32-NEXT: ori $a2, $zero, 8 +; LA32-NEXT: maskeqz $a0, $a2, $a0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: overflow_add_positive_const_limit: +; LA64: # %bb.0: +; LA64-NEXT: ext.w.b $a0, $a0 +; LA64-NEXT: slti $a0, $a0, -1 +; LA64-NEXT: ori $a1, $zero, 16 +; LA64-NEXT: masknez $a1, $a1, $a0 +; LA64-NEXT: ori $a2, $zero, 8 +; LA64-NEXT: maskeqz $a0, $a2, $a0 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret + %cmp = icmp slt i8 %a, -1 + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +define i32 @unsafe_add_underflow(i8 zeroext %a) { +; LA32-LABEL: unsafe_add_underflow: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $a0, $a0, -1 +; LA32-NEXT: sltui $a0, $a0, 1 +; LA32-NEXT: ori $a1, $zero, 16 +; LA32-NEXT: masknez $a1, $a1, $a0 +; LA32-NEXT: ori $a2, $zero, 8 +; LA32-NEXT: maskeqz $a0, $a2, $a0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: unsafe_add_underflow: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $a0, $a0, -1 +; LA64-NEXT: sltui $a0, $a0, 1 +; LA64-NEXT: ori $a1, $zero, 16 +; LA64-NEXT: masknez $a1, $a1, $a0 +; LA64-NEXT: ori $a2, $zero, 8 +; LA64-NEXT: maskeqz $a0, $a2, $a0 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret + %cmp = icmp eq i8 %a, 1 + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +define i32 @safe_add_underflow(i8 zeroext %a) { +; LA32-LABEL: safe_add_underflow: +; LA32: # %bb.0: +; LA32-NEXT: sltui $a0, $a0, 1 +; LA32-NEXT: ori $a1, $zero, 16 +; LA32-NEXT: masknez $a1, $a1, $a0 +; LA32-NEXT: ori $a2, $zero, 8 +; LA32-NEXT: maskeqz $a0, $a2, $a0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: safe_add_underflow: +; LA64: # %bb.0: +; LA64-NEXT: sltui $a0, $a0, 1 +; LA64-NEXT: ori $a1, $zero, 16 +; LA64-NEXT: masknez $a1, $a1, $a0 +; LA64-NEXT: ori $a2, $zero, 8 +; LA64-NEXT: maskeqz $a0, $a2, $a0 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret + %cmp = icmp eq i8 %a, 0 + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +define i32 @safe_add_underflow_neg(i8 zeroext %a) { +; LA32-LABEL: safe_add_underflow_neg: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $a0, $a0, -2 +; LA32-NEXT: andi $a0, $a0, 255 +; LA32-NEXT: sltui $a0, $a0, 251 +; LA32-NEXT: ori $a1, $zero, 16 +; LA32-NEXT: masknez $a1, $a1, $a0 +; LA32-NEXT: ori $a2, $zero, 8 +; LA32-NEXT: maskeqz $a0, $a2, $a0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: safe_add_underflow_neg: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $a0, $a0, -2 +; LA64-NEXT: andi $a0, $a0, 255 +; LA64-NEXT: sltui $a0, $a0, 251 +; LA64-NEXT: ori $a1, $zero, 16 +; LA64-NEXT: masknez $a1, $a1, $a0 +; LA64-NEXT: ori $a2, $zero, 8 +; LA64-NEXT: maskeqz $a0, $a2, $a0 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret + %add = add i8 %a, -2 + %cmp = icmp ult i8 %add, -5 + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +define i32 @overflow_sub_negative_const_limit(i8 zeroext %a) { +; LA32-LABEL: overflow_sub_negative_const_limit: +; LA32: # %bb.0: +; LA32-NEXT: ext.w.b $a0, $a0 +; LA32-NEXT: slti $a0, $a0, -1 +; LA32-NEXT: ori $a1, $zero, 16 +; LA32-NEXT: masknez $a1, $a1, $a0 +; LA32-NEXT: ori $a2, $zero, 8 +; LA32-NEXT: maskeqz $a0, $a2, $a0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: overflow_sub_negative_const_limit: +; LA64: # %bb.0: +; LA64-NEXT: ext.w.b $a0, $a0 +; LA64-NEXT: slti $a0, $a0, -1 +; LA64-NEXT: ori $a1, $zero, 16 +; LA64-NEXT: masknez $a1, $a1, $a0 +; LA64-NEXT: ori $a2, $zero, 8 +; LA64-NEXT: maskeqz $a0, $a2, $a0 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret + %cmp = icmp slt i8 %a, -1 + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +define i32 @sext_sub_underflow(i8 zeroext %a) { +; LA32-LABEL: sext_sub_underflow: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $a0, $a0, -6 +; LA32-NEXT: andi $a0, $a0, 255 +; LA32-NEXT: ori $a1, $zero, 250 +; LA32-NEXT: sltu $a0, $a1, $a0 +; LA32-NEXT: ori $a1, $zero, 16 +; LA32-NEXT: masknez $a1, $a1, $a0 +; LA32-NEXT: ori $a2, $zero, 8 +; LA32-NEXT: maskeqz $a0, $a2, $a0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: sext_sub_underflow: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $a0, $a0, -6 +; LA64-NEXT: andi $a0, $a0, 255 +; LA64-NEXT: ori $a1, $zero, 250 +; LA64-NEXT: sltu $a0, $a1, $a0 +; LA64-NEXT: ori $a1, $zero, 16 +; LA64-NEXT: masknez $a1, $a1, $a0 +; LA64-NEXT: ori $a2, $zero, 8 +; LA64-NEXT: maskeqz $a0, $a2, $a0 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret + %sub = add i8 %a, -6 + %cmp = icmp ugt i8 %sub, -6 + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +define i32 @safe_sub_underflow(i8 zeroext %a) { +; LA32-LABEL: safe_sub_underflow: +; LA32: # %bb.0: +; LA32-NEXT: sltui $a0, $a0, 1 +; LA32-NEXT: ori $a1, $zero, 8 +; LA32-NEXT: masknez $a1, $a1, $a0 +; LA32-NEXT: ori $a2, $zero, 16 +; LA32-NEXT: maskeqz $a0, $a2, $a0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: safe_sub_underflow: +; LA64: # %bb.0: +; LA64-NEXT: sltui $a0, $a0, 1 +; LA64-NEXT: ori $a1, $zero, 8 +; LA64-NEXT: masknez $a1, $a1, $a0 +; LA64-NEXT: ori $a2, $zero, 16 +; LA64-NEXT: maskeqz $a0, $a2, $a0 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret + %cmp.not = icmp eq i8 %a, 0 + %res = select i1 %cmp.not, i32 16, i32 8 + ret i32 %res +} + +define i32 @safe_sub_underflow_neg(i8 zeroext %a) { +; LA32-LABEL: safe_sub_underflow_neg: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $a0, $a0, -4 +; LA32-NEXT: andi $a0, $a0, 255 +; LA32-NEXT: ori $a1, $zero, 250 +; LA32-NEXT: sltu $a0, $a1, $a0 +; LA32-NEXT: ori $a1, $zero, 16 +; LA32-NEXT: masknez $a1, $a1, $a0 +; LA32-NEXT: ori $a2, $zero, 8 +; LA32-NEXT: maskeqz $a0, $a2, $a0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: safe_sub_underflow_neg: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $a0, $a0, -4 +; LA64-NEXT: andi $a0, $a0, 255 +; LA64-NEXT: ori $a1, $zero, 250 +; LA64-NEXT: sltu $a0, $a1, $a0 +; LA64-NEXT: ori $a1, $zero, 16 +; LA64-NEXT: masknez $a1, $a1, $a0 +; LA64-NEXT: ori $a2, $zero, 8 +; LA64-NEXT: maskeqz $a0, $a2, $a0 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret + %sub = add i8 %a, -4 + %cmp = icmp ugt i8 %sub, -6 + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +define i32 @sext_sub_underflow_neg(i8 zeroext %a) { +; LA32-LABEL: sext_sub_underflow_neg: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $a0, $a0, -4 +; LA32-NEXT: andi $a0, $a0, 255 +; LA32-NEXT: sltui $a0, $a0, 253 +; LA32-NEXT: ori $a1, $zero, 16 +; LA32-NEXT: masknez $a1, $a1, $a0 +; LA32-NEXT: ori $a2, $zero, 8 +; LA32-NEXT: maskeqz $a0, $a2, $a0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: sext_sub_underflow_neg: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $a0, $a0, -4 +; LA64-NEXT: andi $a0, $a0, 255 +; LA64-NEXT: sltui $a0, $a0, 253 +; LA64-NEXT: ori $a1, $zero, 16 +; LA64-NEXT: masknez $a1, $a1, $a0 +; LA64-NEXT: ori $a2, $zero, 8 +; LA64-NEXT: maskeqz $a0, $a2, $a0 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret + %sub = add i8 %a, -4 + %cmp = icmp ult i8 %sub, -3 + %res = select i1 %cmp, i32 8, i32 16 + ret i32 %res +} + +define i32 @safe_sub_imm_var(ptr nocapture readonly %b) local_unnamed_addr #1 { +; LA32-LABEL: safe_sub_imm_var: +; LA32: # %bb.0: # %entry +; LA32-NEXT: move $a0, $zero +; LA32-NEXT: ret +; +; LA64-LABEL: safe_sub_imm_var: +; LA64: # %bb.0: # %entry +; LA64-NEXT: move $a0, $zero +; LA64-NEXT: ret +entry: + ret i32 0 +} + +define i32 @safe_sub_var_imm(ptr nocapture readonly %b) local_unnamed_addr #1 { +; LA32-LABEL: safe_sub_var_imm: +; LA32: # %bb.0: # %entry +; LA32-NEXT: ld.b $a0, $a0, 0 +; LA32-NEXT: addi.w $a0, $a0, 8 +; LA32-NEXT: andi $a0, $a0, 255 +; LA32-NEXT: ori $a1, $zero, 252 +; LA32-NEXT: sltu $a0, $a1, $a0 +; LA32-NEXT: ret +; +; LA64-LABEL: safe_sub_var_imm: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ld.b $a0, $a0, 0 +; LA64-NEXT: addi.d $a0, $a0, 8 +; LA64-NEXT: andi $a0, $a0, 255 +; LA64-NEXT: ori $a1, $zero, 252 +; LA64-NEXT: sltu $a0, $a1, $a0 +; LA64-NEXT: ret +entry: + %0 = load i8, ptr %b, align 1 + %sub = add nsw i8 %0, 8 + %cmp = icmp ugt i8 %sub, -4 + %conv4 = zext i1 %cmp to i32 + ret i32 %conv4 +} + +define i32 @safe_add_imm_var(ptr nocapture readnone %b) { +; LA32-LABEL: safe_add_imm_var: +; LA32: # %bb.0: # %entry +; LA32-NEXT: ori $a0, $zero, 1 +; LA32-NEXT: ret +; +; LA64-LABEL: safe_add_imm_var: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ori $a0, $zero, 1 +; LA64-NEXT: ret +entry: + ret i32 1 +} + +define i32 @safe_add_var_imm(ptr nocapture readnone %b) { +; LA32-LABEL: safe_add_var_imm: +; LA32: # %bb.0: # %entry +; LA32-NEXT: ori $a0, $zero, 1 +; LA32-NEXT: ret +; +; LA64-LABEL: safe_add_var_imm: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ori $a0, $zero, 1 +; LA64-NEXT: ret +entry: + ret i32 1 +} + +define i8 @convert_add_order(i8 zeroext %arg) { +; LA32-LABEL: convert_add_order: +; LA32: # %bb.0: +; LA32-NEXT: ori $a1, $a0, 1 +; LA32-NEXT: sltui $a2, $a1, 50 +; LA32-NEXT: addi.w $a1, $a1, -40 +; LA32-NEXT: andi $a1, $a1, 255 +; LA32-NEXT: sltui $a1, $a1, 20 +; LA32-NEXT: ori $a3, $zero, 2 +; LA32-NEXT: sub.w $a1, $a3, $a1 +; LA32-NEXT: addi.w $a3, $zero, -1 +; LA32-NEXT: masknez $a3, $a3, $a2 +; LA32-NEXT: maskeqz $a1, $a1, $a2 +; LA32-NEXT: or $a1, $a1, $a3 +; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: ret +; +; LA64-LABEL: convert_add_order: +; LA64: # %bb.0: +; LA64-NEXT: ori $a1, $a0, 1 +; LA64-NEXT: sltui $a2, $a1, 50 +; LA64-NEXT: addi.d $a1, $a1, -40 +; LA64-NEXT: andi $a1, $a1, 255 +; LA64-NEXT: sltui $a1, $a1, 20 +; LA64-NEXT: ori $a3, $zero, 2 +; LA64-NEXT: sub.d $a1, $a3, $a1 +; LA64-NEXT: addi.w $a3, $zero, -1 +; LA64-NEXT: masknez $a3, $a3, $a2 +; LA64-NEXT: maskeqz $a1, $a1, $a2 +; LA64-NEXT: or $a1, $a1, $a3 +; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: ret + %shl = or i8 %arg, 1 + %cmp.0 = icmp ult i8 %shl, 50 + %sub = add nsw i8 %shl, -40 + %cmp.1 = icmp ult i8 %sub, 20 + %mask.sel.v = select i1 %cmp.1, i8 1, i8 2 + %mask.sel = select i1 %cmp.0, i8 %mask.sel.v, i8 -1 + %res = and i8 %mask.sel, %arg + ret i8 %res +} + +define i8 @underflow_if_sub(i32 %arg, i8 zeroext %arg1) { +; LA32-LABEL: underflow_if_sub: +; LA32: # %bb.0: +; LA32-NEXT: slt $a2, $zero, $a0 +; LA32-NEXT: and $a0, $a2, $a0 +; LA32-NEXT: addi.w $a0, $a0, -11 +; LA32-NEXT: andi $a2, $a0, 247 +; LA32-NEXT: sltu $a1, $a2, $a1 +; LA32-NEXT: maskeqz $a0, $a0, $a1 +; LA32-NEXT: ori $a2, $zero, 100 +; LA32-NEXT: masknez $a1, $a2, $a1 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: underflow_if_sub: +; LA64: # %bb.0: +; LA64-NEXT: addi.w $a2, $a0, 0 +; LA64-NEXT: slt $a2, $zero, $a2 +; LA64-NEXT: and $a0, $a2, $a0 +; LA64-NEXT: addi.d $a0, $a0, -11 +; LA64-NEXT: andi $a2, $a0, 247 +; LA64-NEXT: sltu $a1, $a2, $a1 +; LA64-NEXT: maskeqz $a0, $a0, $a1 +; LA64-NEXT: ori $a2, $zero, 100 +; LA64-NEXT: masknez $a1, $a2, $a1 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret + %cmp = icmp sgt i32 %arg, 0 + %conv = zext i1 %cmp to i32 + %and = and i32 %conv, %arg + %trunc = trunc i32 %and to i8 + %conv1 = add nuw nsw i8 %trunc, -11 + %cmp.1 = icmp ult i8 %conv1, %arg1 + %res = select i1 %cmp.1, i8 %conv1, i8 100 + ret i8 %res +} + +define i8 @underflow_if_sub_signext(i32 %arg, i8 signext %arg1) { +; LA32-LABEL: underflow_if_sub_signext: +; LA32: # %bb.0: +; LA32-NEXT: slt $a2, $zero, $a0 +; LA32-NEXT: and $a0, $a2, $a0 +; LA32-NEXT: addi.w $a0, $a0, -11 +; LA32-NEXT: sltu $a1, $a0, $a1 +; LA32-NEXT: maskeqz $a0, $a0, $a1 +; LA32-NEXT: ori $a2, $zero, 100 +; LA32-NEXT: masknez $a1, $a2, $a1 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: underflow_if_sub_signext: +; LA64: # %bb.0: +; LA64-NEXT: addi.w $a2, $a0, 0 +; LA64-NEXT: slt $a2, $zero, $a2 +; LA64-NEXT: and $a0, $a2, $a0 +; LA64-NEXT: addi.d $a0, $a0, -11 +; LA64-NEXT: sltu $a1, $a0, $a1 +; LA64-NEXT: maskeqz $a0, $a0, $a1 +; LA64-NEXT: ori $a2, $zero, 100 +; LA64-NEXT: masknez $a1, $a2, $a1 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret + %cmp = icmp sgt i32 %arg, 0 + %conv = zext i1 %cmp to i32 + %and = and i32 %conv, %arg + %trunc = trunc i32 %and to i8 + %conv1 = add nuw nsw i8 %trunc, -11 + %cmp.1 = icmp ult i8 %conv1, %arg1 + %res = select i1 %cmp.1, i8 %conv1, i8 100 + ret i8 %res +} diff --git a/llvm/test/CodeGen/LoongArch/ucmp.ll b/llvm/test/CodeGen/LoongArch/ucmp.ll new file mode 100644 index 00000000000000..548c5bd0db72ba --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/ucmp.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch64 -mattr=+d --verify-machineinstrs < %s | FileCheck %s + +define i8 @ucmp.8.8(i8 zeroext %x, i8 zeroext %y) nounwind { +; CHECK-LABEL: ucmp.8.8: +; CHECK: # %bb.0: +; CHECK-NEXT: sltu $a2, $a0, $a1 +; CHECK-NEXT: sltu $a0, $a1, $a0 +; CHECK-NEXT: sub.d $a0, $a0, $a2 +; CHECK-NEXT: ret + %1 = call i8 @llvm.ucmp(i8 %x, i8 %y) + ret i8 %1 +} + +define i8 @ucmp.8.16(i16 zeroext %x, i16 zeroext %y) nounwind { +; CHECK-LABEL: ucmp.8.16: +; CHECK: # %bb.0: +; CHECK-NEXT: sltu $a2, $a0, $a1 +; CHECK-NEXT: sltu $a0, $a1, $a0 +; CHECK-NEXT: sub.d $a0, $a0, $a2 +; CHECK-NEXT: ret + %1 = call i8 @llvm.ucmp(i16 %x, i16 %y) + ret i8 %1 +} + +define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: ucmp.8.32: +; CHECK: # %bb.0: +; CHECK-NEXT: bstrpick.d $a1, $a1, 31, 0 +; CHECK-NEXT: bstrpick.d $a0, $a0, 31, 0 +; CHECK-NEXT: sltu $a2, $a0, $a1 +; CHECK-NEXT: sltu $a0, $a1, $a0 +; CHECK-NEXT: sub.d $a0, $a0, $a2 +; CHECK-NEXT: ret + %1 = call i8 @llvm.ucmp(i32 %x, i32 %y) + ret i8 %1 +} + +define i8 @ucmp.8.64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ucmp.8.64: +; CHECK: # %bb.0: +; CHECK-NEXT: sltu $a2, $a0, $a1 +; CHECK-NEXT: sltu $a0, $a1, $a0 +; CHECK-NEXT: sub.d $a0, $a0, $a2 +; CHECK-NEXT: ret + %1 = call i8 @llvm.ucmp(i64 %x, i64 %y) + ret i8 %1 +} + +define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind { +; CHECK-LABEL: ucmp.8.128: +; CHECK: # %bb.0: +; CHECK-NEXT: sltu $a4, $a1, $a3 +; CHECK-NEXT: xor $a5, $a1, $a3 +; CHECK-NEXT: sltui $a5, $a5, 1 +; CHECK-NEXT: masknez $a4, $a4, $a5 +; CHECK-NEXT: sltu $a6, $a0, $a2 +; CHECK-NEXT: maskeqz $a6, $a6, $a5 +; CHECK-NEXT: or $a4, $a6, $a4 +; CHECK-NEXT: sltu $a1, $a3, $a1 +; CHECK-NEXT: masknez $a1, $a1, $a5 +; CHECK-NEXT: sltu $a0, $a2, $a0 +; CHECK-NEXT: maskeqz $a0, $a0, $a5 +; CHECK-NEXT: or $a0, $a0, $a1 +; CHECK-NEXT: sub.d $a0, $a0, $a4 +; CHECK-NEXT: ret + %1 = call i8 @llvm.ucmp(i128 %x, i128 %y) + ret i8 %1 +} + +define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: ucmp.32.32: +; CHECK: # %bb.0: +; CHECK-NEXT: bstrpick.d $a1, $a1, 31, 0 +; CHECK-NEXT: bstrpick.d $a0, $a0, 31, 0 +; CHECK-NEXT: sltu $a2, $a0, $a1 +; CHECK-NEXT: sltu $a0, $a1, $a0 +; CHECK-NEXT: sub.d $a0, $a0, $a2 +; CHECK-NEXT: ret + %1 = call i32 @llvm.ucmp(i32 %x, i32 %y) + ret i32 %1 +} + +define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ucmp.32.64: +; CHECK: # %bb.0: +; CHECK-NEXT: sltu $a2, $a0, $a1 +; CHECK-NEXT: sltu $a0, $a1, $a0 +; CHECK-NEXT: sub.d $a0, $a0, $a2 +; CHECK-NEXT: ret + %1 = call i32 @llvm.ucmp(i64 %x, i64 %y) + ret i32 %1 +} + +define i64 @ucmp.64.64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ucmp.64.64: +; CHECK: # %bb.0: +; CHECK-NEXT: sltu $a2, $a0, $a1 +; CHECK-NEXT: sltu $a0, $a1, $a0 +; CHECK-NEXT: sub.d $a0, $a0, $a2 +; CHECK-NEXT: ret + %1 = call i64 @llvm.ucmp(i64 %x, i64 %y) + ret i64 %1 +} diff --git a/llvm/test/CodeGen/MIR/Generic/dbg-value-missing-loc.mir b/llvm/test/CodeGen/MIR/Generic/dbg-value-missing-loc.mir index d44ba086c74351..0ee7a5341c23d9 100644 --- a/llvm/test/CodeGen/MIR/Generic/dbg-value-missing-loc.mir +++ b/llvm/test/CodeGen/MIR/Generic/dbg-value-missing-loc.mir @@ -1,4 +1,5 @@ # RUN: not --crash llc -run-pass machineverifier -o - %s 2>&1 | FileCheck %s +# RUN: not --crash llc --passes='machine-function(verify)' -o - %s 2>&1 | FileCheck %s # CHECK: Bad machine code: Missing DebugLoc for debug instruction # CHECK: - instruction: DBG_VALUE 1, 2, 3, 4 diff --git a/llvm/test/CodeGen/MIR/X86/dbg-value-list.mir b/llvm/test/CodeGen/MIR/X86/dbg-value-list.mir index d2886d0fff31ff..e9d6156a0eae90 100644 --- a/llvm/test/CodeGen/MIR/X86/dbg-value-list.mir +++ b/llvm/test/CodeGen/MIR/X86/dbg-value-list.mir @@ -1,4 +1,5 @@ # RUN: llc -march=x86-64 -run-pass machineverifier -o - %s | FileCheck %s +# RUN: llc -march=x86-64 --passes='machine-function(verify)' -o - %s | FileCheck %s # Simple round-trip test for DBG_VALUE_LIST. # CHECK: [[VAR_C:![0-9]+]] = !DILocalVariable(name: "c" # CHECK: DBG_VALUE_LIST [[VAR_C]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_stack_value), $edi, $esi, debug-location diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll new file mode 100644 index 00000000000000..7cdced1778a537 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll @@ -0,0 +1,951 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s +; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %} + +; CHECK-LABEL: generic_plain +define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { + ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load i8, ptr %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i8 %a.add, ptr %a + + ; CHECK: ld.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load i16, ptr %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i16 %b.add, ptr %b + + ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load i32, ptr %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store i32 %c.add, ptr %c + + ; CHECK: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load i64, ptr %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store i64 %d.add, ptr %d + + ; CHECK: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load float, ptr %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store float %e.add, ptr %c + + ; CHECK: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load double, ptr %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store double %f.add, ptr %c + + ret void +} + +; CHECK-LABEL: generic_volatile +define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load volatile i8, ptr %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i8 %a.add, ptr %a + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load volatile i16, ptr %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i16 %b.add, ptr %b + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load volatile i32, ptr %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile i32 %c.add, ptr %c + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load volatile i64, ptr %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store volatile i64 %d.add, ptr %d + + ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load volatile float, ptr %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store volatile float %e.add, ptr %c + + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load volatile double, ptr %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store volatile double %f.add, ptr %c + + ret void +} + +; CHECK-LABEL: generic_monotonic +define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a monotonic, align 1 + + ; CHECK: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b monotonic, align 2 + + ; CHECK: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c monotonic, align 4 + + ; CHECK: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d monotonic, align 8 + + ; CHECK: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e monotonic, align 4 + + ; CHECK: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: generic_acq_rel +define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a release, align 1 + + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b release, align 2 + + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c release, align 4 + + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d release, align 8 + + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e acquire, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e release, align 4 + + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e release, align 8 + + ret void +} + +; CHECK-LABEL: generic_monotonic_volatile +define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a monotonic, align 1 + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b monotonic, align 2 + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c monotonic, align 4 + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d monotonic, align 8 + + ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e monotonic, align 4 + + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e monotonic, align 8 + + ret void +} + +;; global statespace + +; CHECK-LABEL: global_plain +define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { + ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load i8, ptr addrspace(1) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i8 %a.add, ptr addrspace(1) %a + + ; CHECK: ld.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load i16, ptr addrspace(1) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i16 %b.add, ptr addrspace(1) %b + + ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load i32, ptr addrspace(1) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store i32 %c.add, ptr addrspace(1) %c + + ; CHECK: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load i64, ptr addrspace(1) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store i64 %d.add, ptr addrspace(1) %d + + ; CHECK: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load float, ptr addrspace(1) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store float %e.add, ptr addrspace(1) %c + + ; CHECK: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load double, ptr addrspace(1) %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store double %f.add, ptr addrspace(1) %c + + ret void +} + +; CHECK-LABEL: global_volatile +define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { + ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load volatile i8, ptr addrspace(1) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i8 %a.add, ptr addrspace(1) %a + + ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load volatile i16, ptr addrspace(1) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i16 %b.add, ptr addrspace(1) %b + + ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load volatile i32, ptr addrspace(1) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile i32 %c.add, ptr addrspace(1) %c + + ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load volatile i64, ptr addrspace(1) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store volatile i64 %d.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load volatile float, ptr addrspace(1) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store volatile float %e.add, ptr addrspace(1) %c + + ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load volatile double, ptr addrspace(1) %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store volatile double %f.add, ptr addrspace(1) %c + + ret void +} + +; CHECK-LABEL: global_monotonic +define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1 + + ; CHECK: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2 + + ; CHECK: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4 + + ; CHECK: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8 + + ; CHECK: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4 + + ; CHECK: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: global_monotonic_volatile +define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1 + + ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2 + + ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4 + + ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8 + + ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4 + + ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: global_acq_rel +define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a release, align 1 + + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b release, align 2 + + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c release, align 4 + + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d release, align 8 + + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e acquire, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e release, align 4 + + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e release, align 8 + + ret void +} + +; CHECK-LABEL: global_acq_rel_volatile +define void @global_acq_rel_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a release, align 1 + + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b release, align 2 + + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c release, align 4 + + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d release, align 8 + + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e acquire, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e release, align 4 + + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e release, align 8 + + ret void +} + +;; shared statespace + +; CHECK-LABEL: shared_plain +define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { + ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load i8, ptr addrspace(3) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i8 %a.add, ptr addrspace(3) %a + + ; CHECK: ld.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load i16, ptr addrspace(3) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i16 %b.add, ptr addrspace(3) %b + + ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load i32, ptr addrspace(3) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store i32 %c.add, ptr addrspace(3) %c + + ; CHECK: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load i64, ptr addrspace(3) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store i64 %d.add, ptr addrspace(3) %d + + ; CHECK: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load float, ptr addrspace(3) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store float %e.add, ptr addrspace(3) %c + + ; CHECK: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load double, ptr addrspace(3) %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store double %f.add, ptr addrspace(3) %c + + ret void +} + +; CHECK-LABEL: shared_volatile +define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load volatile i8, ptr addrspace(3) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i8 %a.add, ptr addrspace(3) %a + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load volatile i16, ptr addrspace(3) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i16 %b.add, ptr addrspace(3) %b + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load volatile i32, ptr addrspace(3) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile i32 %c.add, ptr addrspace(3) %c + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load volatile i64, ptr addrspace(3) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store volatile i64 %d.add, ptr addrspace(3) %d + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load volatile float, ptr addrspace(3) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store volatile float %e.add, ptr addrspace(3) %c + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load volatile double, ptr addrspace(3) %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store volatile double %f.add, ptr addrspace(3) %c + + ret void +} + +; CHECK-LABEL: shared_monotonic +define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1 + + ; CHECK: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2 + + ; CHECK: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4 + + ; CHECK: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8 + + ; CHECK: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4 + + ; CHECK: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: shared_monotonic_volatile +define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1 + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2 + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4 + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8 + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4 + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: shared_acq_rel +define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a release, align 1 + + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b release, align 2 + + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c release, align 4 + + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d release, align 8 + + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e acquire, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e release, align 4 + + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e release, align 8 + + ret void +} + +; CHECK-LABEL: shared_acq_rel_volatile +define void @shared_acq_rel_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a release, align 1 + + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b release, align 2 + + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c release, align 4 + + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d release, align 8 + + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e acquire, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e release, align 4 + + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e release, align 8 + + ret void +} + +;; local statespace + +; CHECK-LABEL: local_plain +define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load i8, ptr addrspace(5) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i8 %a.add, ptr addrspace(5) %a + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load i16, ptr addrspace(5) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i16 %b.add, ptr addrspace(5) %b + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load i32, ptr addrspace(5) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store i32 %c.add, ptr addrspace(5) %c + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load i64, ptr addrspace(5) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store i64 %d.add, ptr addrspace(5) %d + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load float, ptr addrspace(5) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store float %e.add, ptr addrspace(5) %c + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load double, ptr addrspace(5) %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store double %f.add, ptr addrspace(5) %c + + ret void +} + +; CHECK-LABEL: local_volatile +define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load volatile i8, ptr addrspace(5) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i8 %a.add, ptr addrspace(5) %a + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load volatile i16, ptr addrspace(5) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i16 %b.add, ptr addrspace(5) %b + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load volatile i32, ptr addrspace(5) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile i32 %c.add, ptr addrspace(5) %c + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load volatile i64, ptr addrspace(5) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store volatile i64 %d.add, ptr addrspace(5) %d + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load volatile float, ptr addrspace(5) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store volatile float %e.add, ptr addrspace(5) %c + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load volatile double, ptr addrspace(5) %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store volatile double %f.add, ptr addrspace(5) %c + + ret void +} + +; CHECK-LABEL: local_monotonic +define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: local_monotonic_volatile +define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: local_acq_rel +define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a release, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b release, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c release, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d release, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e acquire, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e release, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e release, align 8 + + ret void +} + +; CHECK-LABEL: local_acq_rel_volatile +define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a release, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b release, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c release, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d release, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e release, align 8 + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll index 0955b433e0f768..27065f5eca9f48 100644 --- a/llvm/test/CodeGen/NVPTX/load-store.ll +++ b/llvm/test/CodeGen/NVPTX/load-store.ll @@ -1,8 +1,10 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} -; CHECK-LABEL: plain -define void @plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { +; generic statespace + +; CHECK-LABEL: generic_plain +define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load i8, ptr %a %a.add = add i8 %a.load, 1 @@ -27,11 +29,23 @@ define void @plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { ; CHECK: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store i64 %d.add, ptr %d + ; CHECK: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load float, ptr %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store float %e.add, ptr %c + + ; CHECK: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load double, ptr %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store double %f.add, ptr %c + ret void } -; CHECK-LABEL: volatile -define void @volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { +; CHECK-LABEL: generic_volatile +define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load volatile i8, ptr %a %a.add = add i8 %a.load, 1 @@ -56,11 +70,23 @@ define void @volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store volatile i64 %d.add, ptr %d + ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load volatile float, ptr %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store volatile float %e.add, ptr %c + + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load volatile double, ptr %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store volatile double %f.add, ptr %c + ret void } -; CHECK-LABEL: monotonic -define void @monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { +; CHECK-LABEL: generic_monotonic +define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a monotonic, align 1 %a.add = add i8 %a.load, 1 @@ -91,5 +117,550 @@ define void @monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_add ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e monotonic, align 4 + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: generic_monotonic_volatile +define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a monotonic, align 1 + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b monotonic, align 2 + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c monotonic, align 4 + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d monotonic, align 8 + + ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e monotonic, align 4 + + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e monotonic, align 8 + + ret void +} + +;; global statespace + +; CHECK-LABEL: global_plain +define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { + ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load i8, ptr addrspace(1) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i8 %a.add, ptr addrspace(1) %a + + ; CHECK: ld.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load i16, ptr addrspace(1) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i16 %b.add, ptr addrspace(1) %b + + ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load i32, ptr addrspace(1) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store i32 %c.add, ptr addrspace(1) %c + + ; CHECK: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load i64, ptr addrspace(1) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store i64 %d.add, ptr addrspace(1) %d + + ; CHECK: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load float, ptr addrspace(1) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store float %e.add, ptr addrspace(1) %c + + ; CHECK: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load double, ptr addrspace(1) %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store double %f.add, ptr addrspace(1) %c + + ret void +} + +; CHECK-LABEL: global_volatile +define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { + ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load volatile i8, ptr addrspace(1) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i8 %a.add, ptr addrspace(1) %a + + ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load volatile i16, ptr addrspace(1) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i16 %b.add, ptr addrspace(1) %b + + ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load volatile i32, ptr addrspace(1) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile i32 %c.add, ptr addrspace(1) %c + + ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load volatile i64, ptr addrspace(1) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store volatile i64 %d.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load volatile float, ptr addrspace(1) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store volatile float %e.add, ptr addrspace(1) %c + + ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load volatile double, ptr addrspace(1) %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store volatile double %f.add, ptr addrspace(1) %c + + ret void +} + +; CHECK-LABEL: global_monotonic +define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1 + + ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2 + + ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4 + + ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8 + + ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4 + + ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: global_monotonic_volatile +define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1 + + ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2 + + ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4 + + ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8 + + ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4 + + ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8 + + ret void +} + +;; shared statespace + +; CHECK-LABEL: shared_plain +define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { + ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load i8, ptr addrspace(3) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i8 %a.add, ptr addrspace(3) %a + + ; CHECK: ld.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load i16, ptr addrspace(3) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i16 %b.add, ptr addrspace(3) %b + + ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load i32, ptr addrspace(3) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store i32 %c.add, ptr addrspace(3) %c + + ; CHECK: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load i64, ptr addrspace(3) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store i64 %d.add, ptr addrspace(3) %d + + ; CHECK: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load float, ptr addrspace(3) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store float %e.add, ptr addrspace(3) %c + + ; CHECK: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load double, ptr addrspace(3) %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store double %f.add, ptr addrspace(3) %c + + ret void +} + +; CHECK-LABEL: shared_volatile +define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load volatile i8, ptr addrspace(3) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i8 %a.add, ptr addrspace(3) %a + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load volatile i16, ptr addrspace(3) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i16 %b.add, ptr addrspace(3) %b + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load volatile i32, ptr addrspace(3) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile i32 %c.add, ptr addrspace(3) %c + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load volatile i64, ptr addrspace(3) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store volatile i64 %d.add, ptr addrspace(3) %d + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load volatile float, ptr addrspace(3) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store volatile float %e.add, ptr addrspace(3) %c + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load volatile double, ptr addrspace(3) %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store volatile double %f.add, ptr addrspace(3) %c + + ret void +} + +; CHECK-LABEL: shared_monotonic +define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1 + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2 + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4 + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8 + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4 + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: shared_monotonic_volatile +define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1 + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2 + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4 + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8 + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4 + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8 + + ret void +} + +;; local statespace + +; CHECK-LABEL: local_plain +define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load i8, ptr addrspace(5) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i8 %a.add, ptr addrspace(5) %a + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load i16, ptr addrspace(5) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store i16 %b.add, ptr addrspace(5) %b + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load i32, ptr addrspace(5) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store i32 %c.add, ptr addrspace(5) %c + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load i64, ptr addrspace(5) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store i64 %d.add, ptr addrspace(5) %d + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load float, ptr addrspace(5) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store float %e.add, ptr addrspace(5) %c + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load double, ptr addrspace(5) %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store double %f.add, ptr addrspace(5) %c + + ret void +} + +; CHECK-LABEL: local_volatile +define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load volatile i8, ptr addrspace(5) %a + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i8 %a.add, ptr addrspace(5) %a + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load volatile i16, ptr addrspace(5) %b + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store volatile i16 %b.add, ptr addrspace(5) %b + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load volatile i32, ptr addrspace(5) %c + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile i32 %c.add, ptr addrspace(5) %c + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load volatile i64, ptr addrspace(5) %d + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store volatile i64 %d.add, ptr addrspace(5) %d + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load volatile float, ptr addrspace(5) %c + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store volatile float %e.add, ptr addrspace(5) %c + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load volatile double, ptr addrspace(5) %c + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store volatile double %f.add, ptr addrspace(5) %c + + ret void +} + +; CHECK-LABEL: local_monotonic +define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8 + + ret void +} + +; CHECK-LABEL: local_monotonic_volatile +define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8 + ret void } diff --git a/llvm/test/CodeGen/NVPTX/rcp-opt.ll b/llvm/test/CodeGen/NVPTX/rcp-opt.ll new file mode 100644 index 00000000000000..e2443c27e8490a --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/rcp-opt.ll @@ -0,0 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %} + +target triple = "nvptx64-nvidia-cuda" + +;; Check if fneg (fdiv 1, X) lowers to fneg (rcp.rn X). + +define double @test1(double %in) { +; CHECK-LABEL: test1( +; CHECK: { +; CHECK-NEXT: .reg .f64 %fd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f64 %fd1, [test1_param_0]; +; CHECK-NEXT: rcp.rn.f64 %fd2, %fd1; +; CHECK-NEXT: neg.f64 %fd3, %fd2; +; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd3; +; CHECK-NEXT: ret; + %div = fdiv double 1.000000e+00, %in + %neg = fsub double -0.000000e+00, %div + ret double %neg +} + +;; Check if fdiv -1, X lowers to fneg (rcp.rn X). + +define double @test2(double %in) { +; CHECK-LABEL: test2( +; CHECK: { +; CHECK-NEXT: .reg .f64 %fd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f64 %fd1, [test2_param_0]; +; CHECK-NEXT: rcp.rn.f64 %fd2, %fd1; +; CHECK-NEXT: neg.f64 %fd3, %fd2; +; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd3; +; CHECK-NEXT: ret; + %div = fdiv double -1.000000e+00, %in + ret double %div +} + +;; Check if fdiv 1, (fneg X) lowers to fneg (rcp.rn X). + +define double @test3(double %in) { +; CHECK-LABEL: test3( +; CHECK: { +; CHECK-NEXT: .reg .f64 %fd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f64 %fd1, [test3_param_0]; +; CHECK-NEXT: rcp.rn.f64 %fd2, %fd1; +; CHECK-NEXT: neg.f64 %fd3, %fd2; +; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd3; +; CHECK-NEXT: ret; + %neg = fsub double -0.000000e+00, %in + %div = fdiv double 1.000000e+00, %neg + ret double %div +} diff --git a/llvm/test/CodeGen/PowerPC/peephole-combineRLWINM-liveness.mir b/llvm/test/CodeGen/PowerPC/peephole-combineRLWINM-liveness.mir new file mode 100644 index 00000000000000..a5714f20f77f88 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/peephole-combineRLWINM-liveness.mir @@ -0,0 +1,27 @@ +# RUN: llc -mtriple=powerpc-ibm-aix -verify-machineinstrs -run-pass=ppc-mi-peepholes -o - %s | FileCheck %s +# RUN: llc -mtriple=powerpc64-ibm-aix -verify-machineinstrs -run-pass=ppc-mi-peepholes -o - %s | FileCheck %s +# RUN: llc -mtriple=powerpc64-linux-gnu -verify-machineinstrs -run-pass=ppc-mi-peepholes -o - %s | FileCheck %s +--- + +name: testFoldRLWINM +tracksRegLiveness: true + +body: | + bb.0.entry: + liveins: $r3 + %0:gprc = COPY $r3 + B %bb.1 + bb.1: + B %bb.2 + bb.2: + %1:gprc = RLWINM killed %0:gprc, 1, 0, 30 + %2:gprc = RLWINM killed %1:gprc, 31, 0, 0 + BLR8 implicit $lr8, implicit $rm + +... + +# CHECK-LABEL: testFoldRLWINM +# CHECK: bb.0.entry: +# CHECK: dead %0:gprc = COPY killed $r3 +# CHECK: bb.2: +# CHECK: dead %2:gprc = LI 0 diff --git a/llvm/test/CodeGen/PowerPC/scmp.ll b/llvm/test/CodeGen/PowerPC/scmp.ll new file mode 100644 index 00000000000000..107137c0bea7c6 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/scmp.ll @@ -0,0 +1,127 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=ppc64le-unknown-unknown %s -o - | FileCheck %s + +define i8 @scmp_8_8(i8 signext %x, i8 signext %y) nounwind { +; CHECK-LABEL: scmp_8_8: +; CHECK: # %bb.0: +; CHECK-NEXT: cmpw 3, 4 +; CHECK-NEXT: sub 5, 4, 3 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: rldicl 5, 5, 1, 63 +; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: blr + %1 = call i8 @llvm.scmp(i8 %x, i8 %y) + ret i8 %1 +} + +define i8 @scmp_8_16(i16 signext %x, i16 signext %y) nounwind { +; CHECK-LABEL: scmp_8_16: +; CHECK: # %bb.0: +; CHECK-NEXT: cmpw 3, 4 +; CHECK-NEXT: sub 5, 4, 3 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: rldicl 5, 5, 1, 63 +; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: blr + %1 = call i8 @llvm.scmp(i16 %x, i16 %y) + ret i8 %1 +} + +define i8 @scmp_8_32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: scmp_8_32: +; CHECK: # %bb.0: +; CHECK-NEXT: extsw 4, 4 +; CHECK-NEXT: extsw 3, 3 +; CHECK-NEXT: cmpw 3, 4 +; CHECK-NEXT: sub 3, 4, 3 +; CHECK-NEXT: li 4, -1 +; CHECK-NEXT: rldicl 3, 3, 1, 63 +; CHECK-NEXT: isellt 3, 4, 3 +; CHECK-NEXT: blr + %1 = call i8 @llvm.scmp(i32 %x, i32 %y) + ret i8 %1 +} + +define i8 @scmp_8_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: scmp_8_64: +; CHECK: # %bb.0: +; CHECK-NEXT: sradi 5, 4, 63 +; CHECK-NEXT: rldicl 6, 3, 1, 63 +; CHECK-NEXT: subc 7, 4, 3 +; CHECK-NEXT: adde 5, 6, 5 +; CHECK-NEXT: cmpd 3, 4 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: xori 5, 5, 1 +; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: blr + %1 = call i8 @llvm.scmp(i64 %x, i64 %y) + ret i8 %1 +} + +define i8 @scmp_8_128(i128 %x, i128 %y) nounwind { +; CHECK-LABEL: scmp_8_128: +; CHECK: # %bb.0: +; CHECK-NEXT: cmpld 4, 6 +; CHECK-NEXT: cmpd 1, 4, 6 +; CHECK-NEXT: li 4, -1 +; CHECK-NEXT: cmpld 5, 3, 5 +; CHECK-NEXT: li 3, 1 +; CHECK-NEXT: crandc 22, 5, 2 +; CHECK-NEXT: crand 21, 2, 21 +; CHECK-NEXT: crand 20, 2, 20 +; CHECK-NEXT: crnor 21, 21, 22 +; CHECK-NEXT: isel 3, 0, 3, 21 +; CHECK-NEXT: crandc 21, 4, 2 +; CHECK-NEXT: cror 20, 20, 21 +; CHECK-NEXT: isel 3, 4, 3, 20 +; CHECK-NEXT: blr + %1 = call i8 @llvm.scmp(i128 %x, i128 %y) + ret i8 %1 +} + +define i32 @scmp_32_32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: scmp_32_32: +; CHECK: # %bb.0: +; CHECK-NEXT: extsw 4, 4 +; CHECK-NEXT: extsw 3, 3 +; CHECK-NEXT: cmpw 3, 4 +; CHECK-NEXT: sub 3, 4, 3 +; CHECK-NEXT: li 4, -1 +; CHECK-NEXT: rldicl 3, 3, 1, 63 +; CHECK-NEXT: isellt 3, 4, 3 +; CHECK-NEXT: blr + %1 = call i32 @llvm.scmp(i32 %x, i32 %y) + ret i32 %1 +} + +define i32 @scmp_32_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: scmp_32_64: +; CHECK: # %bb.0: +; CHECK-NEXT: sradi 5, 4, 63 +; CHECK-NEXT: rldicl 6, 3, 1, 63 +; CHECK-NEXT: subc 7, 4, 3 +; CHECK-NEXT: adde 5, 6, 5 +; CHECK-NEXT: cmpd 3, 4 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: xori 5, 5, 1 +; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: blr + %1 = call i32 @llvm.scmp(i64 %x, i64 %y) + ret i32 %1 +} + +define i64 @scmp_64_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: scmp_64_64: +; CHECK: # %bb.0: +; CHECK-NEXT: sradi 5, 4, 63 +; CHECK-NEXT: rldicl 6, 3, 1, 63 +; CHECK-NEXT: subc 7, 4, 3 +; CHECK-NEXT: adde 5, 6, 5 +; CHECK-NEXT: cmpd 3, 4 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: xori 5, 5, 1 +; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: blr + %1 = call i64 @llvm.scmp(i64 %x, i64 %y) + ret i64 %1 +} diff --git a/llvm/test/CodeGen/PowerPC/ucmp.ll b/llvm/test/CodeGen/PowerPC/ucmp.ll new file mode 100644 index 00000000000000..d2dff6e7e05c89 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/ucmp.ll @@ -0,0 +1,124 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=ppc64le-unknown-unknown %s -o - | FileCheck %s + +define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind { +; CHECK-LABEL: ucmp_8_8: +; CHECK: # %bb.0: +; CHECK-NEXT: cmplw 3, 4 +; CHECK-NEXT: sub 5, 4, 3 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: rldicl 5, 5, 1, 63 +; CHECK-NEXT: rldic 3, 3, 0, 32 +; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: blr + %1 = call i8 @llvm.ucmp(i8 %x, i8 %y) + ret i8 %1 +} + +define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind { +; CHECK-LABEL: ucmp_8_16: +; CHECK: # %bb.0: +; CHECK-NEXT: cmplw 3, 4 +; CHECK-NEXT: sub 5, 4, 3 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: rldicl 5, 5, 1, 63 +; CHECK-NEXT: rldic 3, 3, 0, 32 +; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: blr + %1 = call i8 @llvm.ucmp(i16 %x, i16 %y) + ret i8 %1 +} + +define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: ucmp_8_32: +; CHECK: # %bb.0: +; CHECK-NEXT: clrldi 5, 4, 32 +; CHECK-NEXT: clrldi 6, 3, 32 +; CHECK-NEXT: sub 5, 5, 6 +; CHECK-NEXT: cmplw 3, 4 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: rldic 3, 3, 0, 32 +; CHECK-NEXT: rldicl 5, 5, 1, 63 +; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: blr + %1 = call i8 @llvm.ucmp(i32 %x, i32 %y) + ret i8 %1 +} + +define i8 @ucmp_8_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ucmp_8_64: +; CHECK: # %bb.0: +; CHECK-NEXT: cmpld 3, 4 +; CHECK-NEXT: subc 3, 4, 3 +; CHECK-NEXT: subfe 3, 4, 4 +; CHECK-NEXT: li 4, -1 +; CHECK-NEXT: neg 3, 3 +; CHECK-NEXT: isellt 3, 4, 3 +; CHECK-NEXT: blr + %1 = call i8 @llvm.ucmp(i64 %x, i64 %y) + ret i8 %1 +} + +define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind { +; CHECK-LABEL: ucmp_8_128: +; CHECK: # %bb.0: +; CHECK-NEXT: cmpld 4, 6 +; CHECK-NEXT: cmpld 1, 3, 5 +; CHECK-NEXT: li 3, 1 +; CHECK-NEXT: li 4, -1 +; CHECK-NEXT: crandc 20, 1, 2 +; CHECK-NEXT: crand 21, 2, 5 +; CHECK-NEXT: crnor 20, 21, 20 +; CHECK-NEXT: crand 21, 2, 4 +; CHECK-NEXT: isel 3, 0, 3, 20 +; CHECK-NEXT: crandc 20, 0, 2 +; CHECK-NEXT: cror 20, 21, 20 +; CHECK-NEXT: isel 3, 4, 3, 20 +; CHECK-NEXT: blr + %1 = call i8 @llvm.ucmp(i128 %x, i128 %y) + ret i8 %1 +} + +define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: ucmp_32_32: +; CHECK: # %bb.0: +; CHECK-NEXT: clrldi 5, 4, 32 +; CHECK-NEXT: clrldi 6, 3, 32 +; CHECK-NEXT: sub 5, 5, 6 +; CHECK-NEXT: cmplw 3, 4 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: rldic 3, 3, 0, 32 +; CHECK-NEXT: rldicl 5, 5, 1, 63 +; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: blr + %1 = call i32 @llvm.ucmp(i32 %x, i32 %y) + ret i32 %1 +} + +define i32 @ucmp_32_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ucmp_32_64: +; CHECK: # %bb.0: +; CHECK-NEXT: cmpld 3, 4 +; CHECK-NEXT: subc 3, 4, 3 +; CHECK-NEXT: subfe 3, 4, 4 +; CHECK-NEXT: li 4, -1 +; CHECK-NEXT: neg 3, 3 +; CHECK-NEXT: isellt 3, 4, 3 +; CHECK-NEXT: blr + %1 = call i32 @llvm.ucmp(i64 %x, i64 %y) + ret i32 %1 +} + +define i64 @ucmp_64_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ucmp_64_64: +; CHECK: # %bb.0: +; CHECK-NEXT: subc 5, 4, 3 +; CHECK-NEXT: cmpld 3, 4 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: subfe 5, 4, 4 +; CHECK-NEXT: neg 5, 5 +; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: blr + %1 = call i64 @llvm.ucmp(i64 %x, i64 %y) + ret i64 %1 +} diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 5f82d757a22ec3..c9fe1059b13786 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -407,7 +407,7 @@ ; RV32ZACAS: .attribute 5, "rv32i2p1_a2p1_zacas1p0" ; RV32ZALASR: .attribute 5, "rv32i2p1_zalasr0p1" ; RV32ZAMA16B: .attribute 5, "rv32i2p1_zama16b1p0" -; RV32ZICFILP: .attribute 5, "rv32i2p1_zicfilp0p4_zicsr2p0" +; RV32ZICFILP: .attribute 5, "rv32i2p1_zicfilp1p0_zicsr2p0" ; RV32ZABHA: .attribute 5, "rv32i2p1_a2p1_zabha1p0" ; RV32SSNPM: .attribute 5, "rv32i2p1_ssnpm1p0" ; RV32SMNPM: .attribute 5, "rv32i2p1_smnpm1p0" @@ -543,7 +543,7 @@ ; RV64ZVFBFWMA: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin1p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvfbfwma1p0_zvl32b1p0" ; RV64ZACAS: .attribute 5, "rv64i2p1_a2p1_zacas1p0" ; RV64ZALASR: .attribute 5, "rv64i2p1_zalasr0p1" -; RV64ZICFILP: .attribute 5, "rv64i2p1_zicfilp0p4_zicsr2p0" +; RV64ZICFILP: .attribute 5, "rv64i2p1_zicfilp1p0_zicsr2p0" ; RV64ZABHA: .attribute 5, "rv64i2p1_a2p1_zabha1p0" ; RV64SSNPM: .attribute 5, "rv64i2p1_ssnpm1p0" ; RV64SMNPM: .attribute 5, "rv64i2p1_smnpm1p0" diff --git a/llvm/test/CodeGen/RISCV/pr94265.ll b/llvm/test/CodeGen/RISCV/pr94265.ll new file mode 100644 index 00000000000000..cb41e22381d19d --- /dev/null +++ b/llvm/test/CodeGen/RISCV/pr94265.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32-- -mattr=+v | FileCheck -check-prefix=RV32I %s +; RUN: llc < %s -mtriple=riscv64-- -mattr=+v | FileCheck -check-prefix=RV64I %s + +define <8 x i16> @PR94265(<8 x i32> %a0) #0 { +; RV32I-LABEL: PR94265: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32I-NEXT: vsra.vi v10, v8, 31 +; RV32I-NEXT: vsrl.vi v10, v10, 26 +; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV32I-NEXT: vnsrl.wi v10, v8, 6 +; RV32I-NEXT: vsll.vi v8, v10, 10 +; RV32I-NEXT: ret +; +; RV64I-LABEL: PR94265: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64I-NEXT: vsra.vi v10, v8, 31 +; RV64I-NEXT: vsrl.vi v10, v10, 26 +; RV64I-NEXT: vadd.vv v8, v8, v10 +; RV64I-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64I-NEXT: vnsrl.wi v10, v8, 6 +; RV64I-NEXT: vsll.vi v8, v10, 10 +; RV64I-NEXT: ret + %t1 = sdiv <8 x i32> %a0, + %t2 = trunc <8 x i32> %t1 to <8 x i16> + %t3 = shl <8 x i16> %t2, + ret <8 x i16> %t3 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll index 5e64e9fbc1a2f5..b8c7037580c46b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll @@ -1,10 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh,+optimized-zero-stride-load \ +; RUN: -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-OPT +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh,+optimized-zero-stride-load \ +; RUN: -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-OPT ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh \ ; RUN: -verify-machineinstrs < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32 +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-NO-OPT ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh \ ; RUN: -verify-machineinstrs < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64 +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-NO-OPT declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i8(ptr, i8, <2 x i1>, i32) @@ -626,3 +632,60 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask } declare <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr, i64, <33 x i1>, i32) + +; TODO: Use accurate evl. +; Test unmasked integer zero strided +define <4 x i8> @zero_strided_unmasked_vpload_4i8_i8(ptr %ptr) { +; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4i8_i8: +; CHECK-OPT: # %bb.0: +; CHECK-OPT-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-OPT-NEXT: vlse8.v v8, (a0), zero +; CHECK-OPT-NEXT: ret +; +; CHECK-NO-OPT-LABEL: zero_strided_unmasked_vpload_4i8_i8: +; CHECK-NO-OPT: # %bb.0: +; CHECK-NO-OPT-NEXT: lbu a0, 0(a0) +; CHECK-NO-OPT-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NO-OPT-NEXT: vmv.v.x v8, a0 +; CHECK-NO-OPT-NEXT: ret + %load = call <4 x i8> @llvm.experimental.vp.strided.load.4i8.p0.i8(ptr %ptr, i8 0, <4 x i1> splat (i1 true), i32 3) + ret <4 x i8> %load +} + +; TODO: Use accurate evl. +; Test unmasked float zero strided +define <4 x half> @zero_strided_unmasked_vpload_4f16(ptr %ptr) { +; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4f16: +; CHECK-OPT: # %bb.0: +; CHECK-OPT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-OPT-NEXT: vlse16.v v8, (a0), zero +; CHECK-OPT-NEXT: ret +; +; CHECK-NO-OPT-LABEL: zero_strided_unmasked_vpload_4f16: +; CHECK-NO-OPT: # %bb.0: +; CHECK-NO-OPT-NEXT: flh fa5, 0(a0) +; CHECK-NO-OPT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NO-OPT-NEXT: vfmv.v.f v8, fa5 +; CHECK-NO-OPT-NEXT: ret + %load = call <4 x half> @llvm.experimental.vp.strided.load.4f16.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 3) + ret <4 x half> %load +} + +define <4 x i64> @zero_strided_vadd.vx(<4 x i64> %v, ptr %ptr) { +; CHECK-RV32-LABEL: zero_strided_vadd.vx: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-RV32-NEXT: vlse64.v v10, (a0), zero +; CHECK-RV32-NEXT: vadd.vv v8, v8, v10 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: zero_strided_vadd.vx: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ld a0, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-RV64-NEXT: vadd.vx v8, v8, a0 +; CHECK-RV64-NEXT: ret + %load = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 4) + %w = add <4 x i64> %v, %load + ret <4 x i64> %w +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll index 05c7bd990642c5..afea1dc6d3c2a3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll @@ -663,3 +663,31 @@ define <16 x double> @vfwadd_wf_v16f32(ptr %x, float %y) { %e = fadd <16 x double> %d, %a ret <16 x double> %e } + +define <2 x float> @vfwadd_vf2_v2f32(<2 x half> %x, half %y) { +; CHECK-LABEL: vfwadd_vf2_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vfwadd.vf v9, v8, fa0 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %a = fpext <2 x half> %x to <2 x float> + %b = fpext half %y to float + %c = insertelement <2 x float> poison, float %b, i32 0 + %d = shufflevector <2 x float> %c, <2 x float> poison, <2 x i32> zeroinitializer + %e = fadd <2 x float> %a, %d + ret <2 x float> %e +} + +define <2 x float> @vfwadd_wf2_v2f32(<2 x float> %x, half %y) { +; CHECK-LABEL: vfwadd_wf2_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vfwadd.wf v8, v8, fa0 +; CHECK-NEXT: ret + %b = fpext half %y to float + %c = insertelement <2 x float> poison, float %b, i32 0 + %d = shufflevector <2 x float> %c, <2 x float> poison, <2 x i32> zeroinitializer + %e = fadd <2 x float> %x, %d + ret <2 x float> %e +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll index 5a57801d33b40d..319994d2655651 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll @@ -448,3 +448,18 @@ define <2 x double> @vfwmul_squared_v2f16_v2f64(ptr %x) { %c = fmul <2 x double> %b, %b ret <2 x double> %c } + +define <2 x float> @vfwmul_vf2_v2f32(<2 x half> %x, half %y) { +; CHECK-LABEL: vfwmul_vf2_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vfwmul.vf v9, v8, fa0 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %a = fpext <2 x half> %x to <2 x float> + %b = fpext half %y to float + %c = insertelement <2 x float> poison, float %b, i32 0 + %d = shufflevector <2 x float> %c, <2 x float> poison, <2 x i32> zeroinitializer + %e = fmul <2 x float> %a, %d + ret <2 x float> %e +} diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll index 4d3bced0bcb50f..0010f64a93fd62 100644 --- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll @@ -1,10 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh,+optimized-zero-stride-load \ +; RUN: -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-OPT +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh,+optimized-zero-stride-load \ +; RUN: -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-OPT ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ -; RUN: -check-prefixes=CHECK,CHECK-RV32 +; RUN: -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-NO-OPT ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ -; RUN: -check-prefixes=CHECK,CHECK-RV64 +; RUN: -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-NO-OPT declare @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr, i8, , i32) @@ -780,3 +786,59 @@ define @strided_load_nxv17f64(ptr %ptr, i64 %stride, @llvm.experimental.vp.strided.load.nxv17f64.p0.i64(ptr, i64, , i32) declare @llvm.experimental.vector.extract.nxv1f64( %vec, i64 %idx) declare @llvm.experimental.vector.extract.nxv16f64( %vec, i64 %idx) + +; Test unmasked integer zero strided +define @zero_strided_unmasked_vpload_nxv1i8_i8(ptr %ptr) { +; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_nxv1i8_i8: +; CHECK-OPT: # %bb.0: +; CHECK-OPT-NEXT: vsetivli zero, 4, e8, mf8, ta, ma +; CHECK-OPT-NEXT: vlse8.v v8, (a0), zero +; CHECK-OPT-NEXT: ret +; +; CHECK-NO-OPT-LABEL: zero_strided_unmasked_vpload_nxv1i8_i8: +; CHECK-NO-OPT: # %bb.0: +; CHECK-NO-OPT-NEXT: lbu a0, 0(a0) +; CHECK-NO-OPT-NEXT: vsetivli zero, 4, e8, mf8, ta, ma +; CHECK-NO-OPT-NEXT: vmv.v.x v8, a0 +; CHECK-NO-OPT-NEXT: ret + %load = call @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr %ptr, i8 0, splat (i1 true), i32 4) + ret %load +} + +; Test unmasked float zero strided +define @zero_strided_unmasked_vpload_nxv1f16(ptr %ptr) { +; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_nxv1f16: +; CHECK-OPT: # %bb.0: +; CHECK-OPT-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; CHECK-OPT-NEXT: vlse16.v v8, (a0), zero +; CHECK-OPT-NEXT: ret +; +; CHECK-NO-OPT-LABEL: zero_strided_unmasked_vpload_nxv1f16: +; CHECK-NO-OPT: # %bb.0: +; CHECK-NO-OPT-NEXT: flh fa5, 0(a0) +; CHECK-NO-OPT-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; CHECK-NO-OPT-NEXT: vfmv.v.f v8, fa5 +; CHECK-NO-OPT-NEXT: ret + %load = call @llvm.experimental.vp.strided.load.nxv1f16.p0.i32(ptr %ptr, i32 0, splat (i1 true), i32 4) + ret %load +} + +define @zero_strided_vadd.vx( %v, ptr %ptr) { +; CHECK-RV32-LABEL: zero_strided_vadd.vx: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-RV32-NEXT: vlse64.v v9, (a0), zero +; CHECK-RV32-NEXT: vadd.vv v8, v8, v9 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: zero_strided_vadd.vx: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ld a0, 0(a0) +; CHECK-RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-RV64-NEXT: vadd.vx v8, v8, a0 +; CHECK-RV64-NEXT: ret + %vscale = call i32 @llvm.vscale() + %load = call @llvm.experimental.vp.strided.load.nxv1i64.p0.i32(ptr %ptr, i32 0, splat (i1 true), i32 %vscale) + %w = add %v, %load + ret %w +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll index 1ae20c37d11e3b..44b152126942cb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll @@ -1126,12 +1126,13 @@ exit: ret void } -; Check that we don't forward an AVL if we wouldn't be able to extend its -; LiveInterval without clobbering other val nos. -define @unforwardable_avl(i64 %n, %v, i1 %cmp) { -; CHECK-LABEL: unforwardable_avl: +; Check that if we forward an AVL whose value is clobbered in its LiveInterval +; we emit a copy instead. +define @clobbered_forwarded_avl(i64 %n, %v, i1 %cmp) { +; CHECK-LABEL: clobbered_forwarded_avl: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli a2, a0, e32, m2, ta, ma +; CHECK-NEXT: mv a2, a0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: andi a1, a1, 1 ; CHECK-NEXT: .LBB27_1: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir index 8956ecd2a8bbfd..bf93f5cc1f6f2c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir @@ -134,7 +134,11 @@ ret void } - define void @unforwardable_avl() { + define void @clobberred_forwarded_avl() { + ret void + } + + define void @clobberred_forwarded_phi_avl() { ret void } @@ -995,16 +999,17 @@ body: | PseudoBR %bb.1 ... --- -name: unforwardable_avl +name: clobberred_forwarded_avl tracksRegLiveness: true body: | - ; CHECK-LABEL: name: unforwardable_avl + ; CHECK-LABEL: name: clobberred_forwarded_avl ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $x10, $v8m2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %avl:gprnox0 = COPY $x10 - ; CHECK-NEXT: %outvl:gprnox0 = PseudoVSETVLI %avl, 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY %avl + ; CHECK-NEXT: dead %outvl:gprnox0 = PseudoVSETVLI %avl, 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -1017,7 +1022,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 killed $x0, 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: renamable $v10m2 = PseudoVADD_VV_M2 undef renamable $v10m2, renamable $v8m2, renamable $v8m2, -1, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype - ; CHECK-NEXT: dead $x0 = PseudoVSETVLI %outvl, 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY]], 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: renamable $v8m2 = PseudoVADD_VV_M2 undef renamable $v8m2, killed renamable $v10m2, renamable $v8m2, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: PseudoRET implicit $v8m2 bb.0: @@ -1034,3 +1039,63 @@ body: | renamable $v10m2 = PseudoVADD_VV_M2 undef renamable $v10m2, renamable $v8m2, renamable $v8m2, -1, 5, 0 renamable $v8m2 = PseudoVADD_VV_M2 undef renamable $v8m2, killed renamable $v10m2, killed renamable $v8m2, %outvl:gprnox0, 5, 0 PseudoRET implicit $v8m2 +... +--- +name: clobberred_forwarded_phi_avl +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: clobberred_forwarded_phi_avl + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $x10, $x11, $v8m2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %v:vrm2 = COPY $v8m2 + ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gprnox0 = ADDI $x0, 1 + ; CHECK-NEXT: %x:gpr = COPY $x10 + ; CHECK-NEXT: %y:gpr = COPY $x11 + ; CHECK-NEXT: BEQ %x, %y, %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gprnox0 = ADDI $x0, 2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY [[ADDI]] + ; CHECK-NEXT: dead %outvl:gprnox0 = PseudoVSETVLI [[ADDI]], 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead [[ADDI:%[0-9]+]]:gprnox0 = ADDI [[ADDI]], 1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 killed $x0, 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: renamable $v10m2 = PseudoVADD_VV_M2 undef renamable $v10m2, %v, %v, -1, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY]], 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: renamable $v8m2 = PseudoVADD_VV_M2 undef renamable $v8m2, killed renamable $v10m2, %v, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype + ; CHECK-NEXT: PseudoRET implicit $v8m2 + bb.0: + liveins: $x10, $x11, $v8m2 + %v:vrm2 = COPY $v8m2 + %a:gpr = ADDI $x0, 1 + %x:gpr = COPY $x10 + %y:gpr = COPY $x11 + BEQ %x, %y, %bb.2 + + bb.1: + %b:gpr = ADDI $x0, 2 + + bb.2: + %avl:gprnox0 = PHI %a, %bb.0, %b, %bb.1 + %outvl:gprnox0 = PseudoVSETVLI %avl:gprnox0, 209, implicit-def dead $vl, implicit-def dead $vtype + + bb.3: + %avl:gprnox0 = ADDI %avl:gprnox0, 1 + + bb.4: + renamable $v10m2 = PseudoVADD_VV_M2 undef renamable $v10m2, %v, %v, -1, 5, 0 + renamable $v8m2 = PseudoVADD_VV_M2 undef renamable $v8m2, killed renamable $v10m2, killed %v, %outvl:gprnox0, 5, 0 + PseudoRET implicit $v8m2 diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir index deff36835a84ef..a9cf741b1cb2ab 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir @@ -589,7 +589,6 @@ body: | ; CHECK-LABEL: name: coalesce_shrink_removed_vsetvlis_uses ; CHECK: liveins: $x10, $v8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %avl1:gprnox0 = ADDI $x0, 1 ; CHECK-NEXT: %avl2:gprnox0 = ADDI $x0, 2 ; CHECK-NEXT: dead $x0 = PseudoVSETVLI %avl2, 209 /* e32, m2, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %x:gpr = COPY $x10 diff --git a/llvm/test/CodeGen/RISCV/scmp.ll b/llvm/test/CodeGen/RISCV/scmp.ll new file mode 100644 index 00000000000000..e79b6989410a6c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/scmp.ll @@ -0,0 +1,224 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv32 | FileCheck %s --check-prefix=RV32I +; RUN: llc < %s -mtriple=riscv64 | FileCheck %s --check-prefix=RV64I + +define i8 @scmp.8.8(i8 signext %x, i8 signext %y) nounwind { +; RV32I-LABEL: scmp.8.8: +; RV32I: # %bb.0: +; RV32I-NEXT: slt a2, a0, a1 +; RV32I-NEXT: slt a0, a1, a0 +; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: scmp.8.8: +; RV64I: # %bb.0: +; RV64I-NEXT: slt a2, a0, a1 +; RV64I-NEXT: slt a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: ret + %1 = call i8 @llvm.scmp(i8 %x, i8 %y) + ret i8 %1 +} + +define i8 @scmp.8.16(i16 signext %x, i16 signext %y) nounwind { +; RV32I-LABEL: scmp.8.16: +; RV32I: # %bb.0: +; RV32I-NEXT: slt a2, a0, a1 +; RV32I-NEXT: slt a0, a1, a0 +; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: scmp.8.16: +; RV64I: # %bb.0: +; RV64I-NEXT: slt a2, a0, a1 +; RV64I-NEXT: slt a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: ret + %1 = call i8 @llvm.scmp(i16 %x, i16 %y) + ret i8 %1 +} + +define i8 @scmp.8.32(i32 %x, i32 %y) nounwind { +; RV32I-LABEL: scmp.8.32: +; RV32I: # %bb.0: +; RV32I-NEXT: slt a2, a0, a1 +; RV32I-NEXT: slt a0, a1, a0 +; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: scmp.8.32: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a1, a1 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: slt a2, a0, a1 +; RV64I-NEXT: slt a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: ret + %1 = call i8 @llvm.scmp(i32 %x, i32 %y) + ret i8 %1 +} + +define i8 @scmp.8.64(i64 %x, i64 %y) nounwind { +; RV32I-LABEL: scmp.8.64: +; RV32I: # %bb.0: +; RV32I-NEXT: beq a1, a3, .LBB3_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt a4, a1, a3 +; RV32I-NEXT: slt a0, a3, a1 +; RV32I-NEXT: sub a0, a0, a4 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB3_2: +; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: sltu a0, a2, a0 +; RV32I-NEXT: sub a0, a0, a4 +; RV32I-NEXT: ret +; +; RV64I-LABEL: scmp.8.64: +; RV64I: # %bb.0: +; RV64I-NEXT: slt a2, a0, a1 +; RV64I-NEXT: slt a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: ret + %1 = call i8 @llvm.scmp(i64 %x, i64 %y) + ret i8 %1 +} + +define i8 @scmp.8.128(i128 %x, i128 %y) nounwind { +; RV32I-LABEL: scmp.8.128: +; RV32I: # %bb.0: +; RV32I-NEXT: lw a2, 4(a1) +; RV32I-NEXT: lw a3, 4(a0) +; RV32I-NEXT: lw a4, 8(a1) +; RV32I-NEXT: lw a5, 12(a1) +; RV32I-NEXT: lw a6, 12(a0) +; RV32I-NEXT: lw a7, 8(a0) +; RV32I-NEXT: beq a6, a5, .LBB4_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt t2, a6, a5 +; RV32I-NEXT: j .LBB4_3 +; RV32I-NEXT: .LBB4_2: +; RV32I-NEXT: sltu t2, a7, a4 +; RV32I-NEXT: .LBB4_3: +; RV32I-NEXT: lw a1, 0(a1) +; RV32I-NEXT: lw t0, 0(a0) +; RV32I-NEXT: beq a3, a2, .LBB4_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: sltu a0, a3, a2 +; RV32I-NEXT: j .LBB4_6 +; RV32I-NEXT: .LBB4_5: +; RV32I-NEXT: sltu a0, t0, a1 +; RV32I-NEXT: .LBB4_6: +; RV32I-NEXT: xor t1, a6, a5 +; RV32I-NEXT: xor t3, a7, a4 +; RV32I-NEXT: or t1, t3, t1 +; RV32I-NEXT: beqz t1, .LBB4_8 +; RV32I-NEXT: # %bb.7: +; RV32I-NEXT: mv a0, t2 +; RV32I-NEXT: .LBB4_8: +; RV32I-NEXT: beq a6, a5, .LBB4_11 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: slt a4, a5, a6 +; RV32I-NEXT: bne a3, a2, .LBB4_12 +; RV32I-NEXT: .LBB4_10: +; RV32I-NEXT: sltu a1, a1, t0 +; RV32I-NEXT: bnez t1, .LBB4_13 +; RV32I-NEXT: j .LBB4_14 +; RV32I-NEXT: .LBB4_11: +; RV32I-NEXT: sltu a4, a4, a7 +; RV32I-NEXT: beq a3, a2, .LBB4_10 +; RV32I-NEXT: .LBB4_12: +; RV32I-NEXT: sltu a1, a2, a3 +; RV32I-NEXT: beqz t1, .LBB4_14 +; RV32I-NEXT: .LBB4_13: +; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: .LBB4_14: +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: scmp.8.128: +; RV64I: # %bb.0: +; RV64I-NEXT: beq a1, a3, .LBB4_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: slt a4, a1, a3 +; RV64I-NEXT: slt a0, a3, a1 +; RV64I-NEXT: sub a0, a0, a4 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB4_2: +; RV64I-NEXT: sltu a4, a0, a2 +; RV64I-NEXT: sltu a0, a2, a0 +; RV64I-NEXT: sub a0, a0, a4 +; RV64I-NEXT: ret + %1 = call i8 @llvm.scmp(i128 %x, i128 %y) + ret i8 %1 +} + +define i32 @scmp.32.32(i32 %x, i32 %y) nounwind { +; RV32I-LABEL: scmp.32.32: +; RV32I: # %bb.0: +; RV32I-NEXT: slt a2, a0, a1 +; RV32I-NEXT: slt a0, a1, a0 +; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: scmp.32.32: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a1, a1 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: slt a2, a0, a1 +; RV64I-NEXT: slt a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: ret + %1 = call i32 @llvm.scmp(i32 %x, i32 %y) + ret i32 %1 +} + +define i32 @scmp.32.64(i64 %x, i64 %y) nounwind { +; RV32I-LABEL: scmp.32.64: +; RV32I: # %bb.0: +; RV32I-NEXT: beq a1, a3, .LBB6_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt a4, a1, a3 +; RV32I-NEXT: slt a0, a3, a1 +; RV32I-NEXT: sub a0, a0, a4 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB6_2: +; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: sltu a0, a2, a0 +; RV32I-NEXT: sub a0, a0, a4 +; RV32I-NEXT: ret +; +; RV64I-LABEL: scmp.32.64: +; RV64I: # %bb.0: +; RV64I-NEXT: slt a2, a0, a1 +; RV64I-NEXT: slt a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: ret + %1 = call i32 @llvm.scmp(i64 %x, i64 %y) + ret i32 %1 +} + +define i64 @scmp.64.64(i64 %x, i64 %y) nounwind { +; RV32I-LABEL: scmp.64.64: +; RV32I: # %bb.0: +; RV32I-NEXT: beq a1, a3, .LBB7_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt a4, a1, a3 +; RV32I-NEXT: slt a0, a3, a1 +; RV32I-NEXT: j .LBB7_3 +; RV32I-NEXT: .LBB7_2: +; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: sltu a0, a2, a0 +; RV32I-NEXT: .LBB7_3: +; RV32I-NEXT: sub a0, a0, a4 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: ret +; +; RV64I-LABEL: scmp.64.64: +; RV64I: # %bb.0: +; RV64I-NEXT: slt a2, a0, a1 +; RV64I-NEXT: slt a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: ret + %1 = call i64 @llvm.scmp(i64 %x, i64 %y) + ret i64 %1 +} diff --git a/llvm/test/CodeGen/RISCV/ucmp.ll b/llvm/test/CodeGen/RISCV/ucmp.ll new file mode 100644 index 00000000000000..026340ede1f908 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/ucmp.ll @@ -0,0 +1,228 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv32 | FileCheck %s --check-prefix=RV32I +; RUN: llc < %s -mtriple=riscv64 | FileCheck %s --check-prefix=RV64I + +define i8 @ucmp.8.8(i8 zeroext %x, i8 zeroext %y) nounwind { +; RV32I-LABEL: ucmp.8.8: +; RV32I: # %bb.0: +; RV32I-NEXT: sltu a2, a0, a1 +; RV32I-NEXT: sltu a0, a1, a0 +; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ucmp.8.8: +; RV64I: # %bb.0: +; RV64I-NEXT: sltu a2, a0, a1 +; RV64I-NEXT: sltu a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: ret + %1 = call i8 @llvm.ucmp(i8 %x, i8 %y) + ret i8 %1 +} + +define i8 @ucmp.8.16(i16 zeroext %x, i16 zeroext %y) nounwind { +; RV32I-LABEL: ucmp.8.16: +; RV32I: # %bb.0: +; RV32I-NEXT: sltu a2, a0, a1 +; RV32I-NEXT: sltu a0, a1, a0 +; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ucmp.8.16: +; RV64I: # %bb.0: +; RV64I-NEXT: sltu a2, a0, a1 +; RV64I-NEXT: sltu a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: ret + %1 = call i8 @llvm.ucmp(i16 %x, i16 %y) + ret i8 %1 +} + +define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind { +; RV32I-LABEL: ucmp.8.32: +; RV32I: # %bb.0: +; RV32I-NEXT: sltu a2, a0, a1 +; RV32I-NEXT: sltu a0, a1, a0 +; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ucmp.8.32: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: srli a1, a1, 32 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: sltu a2, a0, a1 +; RV64I-NEXT: sltu a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: ret + %1 = call i8 @llvm.ucmp(i32 %x, i32 %y) + ret i8 %1 +} + +define i8 @ucmp.8.64(i64 %x, i64 %y) nounwind { +; RV32I-LABEL: ucmp.8.64: +; RV32I: # %bb.0: +; RV32I-NEXT: beq a1, a3, .LBB3_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a4, a1, a3 +; RV32I-NEXT: sltu a0, a3, a1 +; RV32I-NEXT: sub a0, a0, a4 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB3_2: +; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: sltu a0, a2, a0 +; RV32I-NEXT: sub a0, a0, a4 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ucmp.8.64: +; RV64I: # %bb.0: +; RV64I-NEXT: sltu a2, a0, a1 +; RV64I-NEXT: sltu a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: ret + %1 = call i8 @llvm.ucmp(i64 %x, i64 %y) + ret i8 %1 +} + +define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind { +; RV32I-LABEL: ucmp.8.128: +; RV32I: # %bb.0: +; RV32I-NEXT: lw a2, 4(a1) +; RV32I-NEXT: lw a3, 4(a0) +; RV32I-NEXT: lw a4, 8(a1) +; RV32I-NEXT: lw a5, 12(a1) +; RV32I-NEXT: lw a6, 12(a0) +; RV32I-NEXT: lw a7, 8(a0) +; RV32I-NEXT: beq a6, a5, .LBB4_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu t2, a6, a5 +; RV32I-NEXT: j .LBB4_3 +; RV32I-NEXT: .LBB4_2: +; RV32I-NEXT: sltu t2, a7, a4 +; RV32I-NEXT: .LBB4_3: +; RV32I-NEXT: lw a1, 0(a1) +; RV32I-NEXT: lw t0, 0(a0) +; RV32I-NEXT: beq a3, a2, .LBB4_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: sltu a0, a3, a2 +; RV32I-NEXT: j .LBB4_6 +; RV32I-NEXT: .LBB4_5: +; RV32I-NEXT: sltu a0, t0, a1 +; RV32I-NEXT: .LBB4_6: +; RV32I-NEXT: xor t1, a6, a5 +; RV32I-NEXT: xor t3, a7, a4 +; RV32I-NEXT: or t1, t3, t1 +; RV32I-NEXT: beqz t1, .LBB4_8 +; RV32I-NEXT: # %bb.7: +; RV32I-NEXT: mv a0, t2 +; RV32I-NEXT: .LBB4_8: +; RV32I-NEXT: beq a6, a5, .LBB4_11 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: sltu a4, a5, a6 +; RV32I-NEXT: bne a3, a2, .LBB4_12 +; RV32I-NEXT: .LBB4_10: +; RV32I-NEXT: sltu a1, a1, t0 +; RV32I-NEXT: bnez t1, .LBB4_13 +; RV32I-NEXT: j .LBB4_14 +; RV32I-NEXT: .LBB4_11: +; RV32I-NEXT: sltu a4, a4, a7 +; RV32I-NEXT: beq a3, a2, .LBB4_10 +; RV32I-NEXT: .LBB4_12: +; RV32I-NEXT: sltu a1, a2, a3 +; RV32I-NEXT: beqz t1, .LBB4_14 +; RV32I-NEXT: .LBB4_13: +; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: .LBB4_14: +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ucmp.8.128: +; RV64I: # %bb.0: +; RV64I-NEXT: beq a1, a3, .LBB4_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sltu a4, a1, a3 +; RV64I-NEXT: sltu a0, a3, a1 +; RV64I-NEXT: sub a0, a0, a4 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB4_2: +; RV64I-NEXT: sltu a4, a0, a2 +; RV64I-NEXT: sltu a0, a2, a0 +; RV64I-NEXT: sub a0, a0, a4 +; RV64I-NEXT: ret + %1 = call i8 @llvm.ucmp(i128 %x, i128 %y) + ret i8 %1 +} + +define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind { +; RV32I-LABEL: ucmp.32.32: +; RV32I: # %bb.0: +; RV32I-NEXT: sltu a2, a0, a1 +; RV32I-NEXT: sltu a0, a1, a0 +; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ucmp.32.32: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: srli a1, a1, 32 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: sltu a2, a0, a1 +; RV64I-NEXT: sltu a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: ret + %1 = call i32 @llvm.ucmp(i32 %x, i32 %y) + ret i32 %1 +} + +define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind { +; RV32I-LABEL: ucmp.32.64: +; RV32I: # %bb.0: +; RV32I-NEXT: beq a1, a3, .LBB6_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a4, a1, a3 +; RV32I-NEXT: sltu a0, a3, a1 +; RV32I-NEXT: sub a0, a0, a4 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB6_2: +; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: sltu a0, a2, a0 +; RV32I-NEXT: sub a0, a0, a4 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ucmp.32.64: +; RV64I: # %bb.0: +; RV64I-NEXT: sltu a2, a0, a1 +; RV64I-NEXT: sltu a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: ret + %1 = call i32 @llvm.ucmp(i64 %x, i64 %y) + ret i32 %1 +} + +define i64 @ucmp.64.64(i64 %x, i64 %y) nounwind { +; RV32I-LABEL: ucmp.64.64: +; RV32I: # %bb.0: +; RV32I-NEXT: beq a1, a3, .LBB7_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a4, a1, a3 +; RV32I-NEXT: sltu a0, a3, a1 +; RV32I-NEXT: j .LBB7_3 +; RV32I-NEXT: .LBB7_2: +; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: sltu a0, a2, a0 +; RV32I-NEXT: .LBB7_3: +; RV32I-NEXT: sub a0, a0, a4 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ucmp.64.64: +; RV64I: # %bb.0: +; RV64I-NEXT: sltu a2, a0, a1 +; RV64I-NEXT: sltu a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: ret + %1 = call i64 @llvm.ucmp(i64 %x, i64 %y) + ret i64 %1 +} diff --git a/llvm/test/CodeGen/RISCV/verify-instr.mir b/llvm/test/CodeGen/RISCV/verify-instr.mir index 622163659a9dd7..b6deed3af3f523 100644 --- a/llvm/test/CodeGen/RISCV/verify-instr.mir +++ b/llvm/test/CodeGen/RISCV/verify-instr.mir @@ -1,4 +1,5 @@ # RUN: not --crash llc -mtriple=riscv32 -run-pass machineverifier %s -o - 2>&1 | FileCheck %s +# RUN: not --crash llc -mtriple=riscv32 --passes='machine-function(verify)' %s -o - 2>&1 | FileCheck %s # CHECK: *** Bad machine code: Invalid immediate *** # CHECK: - instruction: $x2 = ADDI $x1, 10000 diff --git a/llvm/test/CodeGen/RISCV/xcvmac.ll b/llvm/test/CodeGen/RISCV/xcvmac.ll new file mode 100644 index 00000000000000..68efdf7210f7f5 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/xcvmac.ll @@ -0,0 +1,211 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+m -mattr=+xcvmac -verify-machineinstrs < %s \ +; RUN: | FileCheck %s + +declare i32 @llvm.riscv.cv.mac.mac(i32, i32, i32) + +define i32 @test.mac(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test.mac: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.mac a2, a0, a1 +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: ret + %1 = call i32 @llvm.riscv.cv.mac.mac(i32 %a, i32 %b, i32 %c) + ret i32 %1 +} + +declare i32 @llvm.riscv.cv.mac.msu(i32, i32, i32) + +define i32 @test.msu(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test.msu: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.msu a2, a0, a1 +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: ret + %1 = call i32 @llvm.riscv.cv.mac.msu(i32 %a, i32 %b, i32 %c) + ret i32 %1 +} + +declare i32 @llvm.riscv.cv.mac.muluN(i32, i32, i32) + +define i32 @test.muluN(i32 %a, i32 %b) { +; CHECK-LABEL: test.muluN: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.mulun a0, a0, a1, 5 +; CHECK-NEXT: ret + %1 = call i32 @llvm.riscv.cv.mac.muluN(i32 %a, i32 %b, i32 5) + ret i32 %1 +} + +declare i32 @llvm.riscv.cv.mac.mulhhuN(i32, i32, i32) + +define i32 @test.mulhhuN(i32 %a, i32 %b) { +; CHECK-LABEL: test.mulhhuN: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.mulhhun a0, a0, a1, 5 +; CHECK-NEXT: ret + %1 = call i32 @llvm.riscv.cv.mac.mulhhuN(i32 %a, i32 %b, i32 5) + ret i32 %1 +} + +declare i32 @llvm.riscv.cv.mac.mulsN(i32, i32, i32) + +define i32 @test.mulsN(i32 %a, i32 %b) { +; CHECK-LABEL: test.mulsN: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.mulsn a0, a0, a1, 5 +; CHECK-NEXT: ret + %1 = call i32 @llvm.riscv.cv.mac.mulsN(i32 %a, i32 %b, i32 5) + ret i32 %1 +} + +declare i32 @llvm.riscv.cv.mac.mulhhsN(i32, i32, i32) + +define i32 @test.mulhhsN(i32 %a, i32 %b) { +; CHECK-LABEL: test.mulhhsN: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.mulhhsn a0, a0, a1, 5 +; CHECK-NEXT: ret + %1 = call i32 @llvm.riscv.cv.mac.mulhhsN(i32 %a, i32 %b, i32 5) + ret i32 %1 +} + +declare i32 @llvm.riscv.cv.mac.muluRN(i32, i32, i32) + +define i32 @test.muluRN(i32 %a, i32 %b) { +; CHECK-LABEL: test.muluRN: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.mulurn a0, a0, a1, 5 +; CHECK-NEXT: ret + %1 = call i32 @llvm.riscv.cv.mac.muluRN(i32 %a, i32 %b, i32 5) + ret i32 %1 +} + +declare i32 @llvm.riscv.cv.mac.mulhhuRN(i32, i32, i32) + +define i32 @test.mulhhuRN(i32 %a, i32 %b) { +; CHECK-LABEL: test.mulhhuRN: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.mulhhurn a0, a0, a1, 5 +; CHECK-NEXT: ret + %1 = call i32 @llvm.riscv.cv.mac.mulhhuRN(i32 %a, i32 %b, i32 5) + ret i32 %1 +} + +declare i32 @llvm.riscv.cv.mac.mulsRN(i32, i32, i32) + +define i32 @test.mulsRN(i32 %a, i32 %b) { +; CHECK-LABEL: test.mulsRN: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.mulsrn a0, a0, a1, 5 +; CHECK-NEXT: ret + %1 = call i32 @llvm.riscv.cv.mac.mulsRN(i32 %a, i32 %b, i32 5) + ret i32 %1 +} + +declare i32 @llvm.riscv.cv.mac.mulhhsRN(i32, i32, i32) + +define i32 @test.mulhhsRN(i32 %a, i32 %b) { +; CHECK-LABEL: test.mulhhsRN: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.mulhhsrn a0, a0, a1, 5 +; CHECK-NEXT: ret + %1 = call i32 @llvm.riscv.cv.mac.mulhhsRN(i32 %a, i32 %b, i32 5) + ret i32 %1 +} + +declare i32 @llvm.riscv.cv.mac.macuN(i32, i32, i32, i32) + +define i32 @test.macuN(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test.macuN: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.macun a2, a0, a1, 5 +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: ret + %1 = call i32 @llvm.riscv.cv.mac.macuN(i32 %a, i32 %b, i32 %c, i32 5) + ret i32 %1 +} + +declare i32 @llvm.riscv.cv.mac.machhuN(i32, i32, i32, i32) + +define i32 @test.machhuN(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test.machhuN: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.machhun a2, a0, a1, 5 +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: ret + %1 = call i32 @llvm.riscv.cv.mac.machhuN(i32 %a, i32 %b, i32 %c, i32 5) + ret i32 %1 +} + +declare i32 @llvm.riscv.cv.mac.macsN(i32, i32, i32, i32) + +define i32 @test.macsN(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test.macsN: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.macsn a2, a0, a1, 5 +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: ret + %1 = call i32 @llvm.riscv.cv.mac.macsN(i32 %a, i32 %b, i32 %c, i32 5) + ret i32 %1 +} + +declare i32 @llvm.riscv.cv.mac.machhsN(i32, i32, i32, i32) + +define i32 @test.machhsN(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test.machhsN: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.machhsn a2, a0, a1, 5 +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: ret + %1 = call i32 @llvm.riscv.cv.mac.machhsN(i32 %a, i32 %b, i32 %c, i32 5) + ret i32 %1 +} + +declare i32 @llvm.riscv.cv.mac.macuRN(i32, i32, i32, i32) + +define i32 @test.macuRN(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test.macuRN: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.macurn a2, a0, a1, 5 +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: ret + %1 = call i32 @llvm.riscv.cv.mac.macuRN(i32 %a, i32 %b, i32 %c, i32 5) + ret i32 %1 +} + +declare i32 @llvm.riscv.cv.mac.machhuRN(i32, i32, i32, i32) + +define i32 @test.machhuRN(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test.machhuRN: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.machhurn a2, a0, a1, 5 +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: ret + %1 = call i32 @llvm.riscv.cv.mac.machhuRN(i32 %a, i32 %b, i32 %c, i32 5) + ret i32 %1 +} + +declare i32 @llvm.riscv.cv.mac.macsRN(i32, i32, i32, i32) + +define i32 @test.macsRN(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test.macsRN: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.macsrn a2, a0, a1, 5 +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: ret + %1 = call i32 @llvm.riscv.cv.mac.macsRN(i32 %a, i32 %b, i32 %c, i32 5) + ret i32 %1 +} + +declare i32 @llvm.riscv.cv.mac.machhsRN(i32, i32, i32, i32) + +define i32 @test.machhsRN(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test.machhsRN: +; CHECK: # %bb.0: +; CHECK-NEXT: cv.machhsrn a2, a0, a1, 5 +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: ret + %1 = call i32 @llvm.riscv.cv.mac.machhsRN(i32 %a, i32 %b, i32 %c, i32 5) + ret i32 %1 +} diff --git a/llvm/test/CodeGen/SystemZ/scmp.ll b/llvm/test/CodeGen/SystemZ/scmp.ll new file mode 100644 index 00000000000000..3ecaa60a58d245 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/scmp.ll @@ -0,0 +1,109 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +define i8 @scmp.8.8(i8 signext %x, i8 signext %y) nounwind { +; CHECK-LABEL: scmp.8.8: +; CHECK: # %bb.0: +; CHECK-NEXT: cr %r2, %r3 +; CHECK-NEXT: lhi %r2, 0 +; CHECK-NEXT: lochih %r2, 1 +; CHECK-NEXT: lochil %r2, -1 +; CHECK-NEXT: br %r14 + %1 = call i8 @llvm.scmp(i8 %x, i8 %y) + ret i8 %1 +} + +define i8 @scmp.8.16(i16 signext %x, i16 signext %y) nounwind { +; CHECK-LABEL: scmp.8.16: +; CHECK: # %bb.0: +; CHECK-NEXT: cr %r2, %r3 +; CHECK-NEXT: lhi %r2, 0 +; CHECK-NEXT: lochih %r2, 1 +; CHECK-NEXT: lochil %r2, -1 +; CHECK-NEXT: br %r14 + %1 = call i8 @llvm.scmp(i16 %x, i16 %y) + ret i8 %1 +} + +define i8 @scmp.8.32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: scmp.8.32: +; CHECK: # %bb.0: +; CHECK-NEXT: cr %r2, %r3 +; CHECK-NEXT: lhi %r2, 0 +; CHECK-NEXT: lochih %r2, 1 +; CHECK-NEXT: lochil %r2, -1 +; CHECK-NEXT: br %r14 + %1 = call i8 @llvm.scmp(i32 %x, i32 %y) + ret i8 %1 +} + +define i8 @scmp.8.64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: scmp.8.64: +; CHECK: # %bb.0: +; CHECK-NEXT: cgr %r2, %r3 +; CHECK-NEXT: lhi %r2, 0 +; CHECK-NEXT: lochih %r2, 1 +; CHECK-NEXT: lochil %r2, -1 +; CHECK-NEXT: br %r14 + %1 = call i8 @llvm.scmp(i64 %x, i64 %y) + ret i8 %1 +} + +define i8 @scmp.8.128(i128 %x, i128 %y) nounwind { +; CHECK-LABEL: scmp.8.128: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vecg %v0, %v1 +; CHECK-NEXT: jlh .LBB4_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vchlgs %v2, %v1, %v0 +; CHECK-NEXT: .LBB4_2: +; CHECK-NEXT: lhi %r2, 0 +; CHECK-NEXT: lochil %r2, 1 +; CHECK-NEXT: vecg %v1, %v0 +; CHECK-NEXT: jlh .LBB4_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: vchlgs %v0, %v0, %v1 +; CHECK-NEXT: .LBB4_4: +; CHECK-NEXT: lochil %r2, -1 +; CHECK-NEXT: br %r14 + %1 = call i8 @llvm.scmp(i128 %x, i128 %y) + ret i8 %1 +} + +define i32 @scmp.32.32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: scmp.32.32: +; CHECK: # %bb.0: +; CHECK-NEXT: cr %r2, %r3 +; CHECK-NEXT: lhi %r2, 0 +; CHECK-NEXT: lochih %r2, 1 +; CHECK-NEXT: lochil %r2, -1 +; CHECK-NEXT: br %r14 + %1 = call i32 @llvm.scmp(i32 %x, i32 %y) + ret i32 %1 +} + +define i32 @scmp.32.64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: scmp.32.64: +; CHECK: # %bb.0: +; CHECK-NEXT: cgr %r2, %r3 +; CHECK-NEXT: lhi %r2, 0 +; CHECK-NEXT: lochih %r2, 1 +; CHECK-NEXT: lochil %r2, -1 +; CHECK-NEXT: br %r14 + %1 = call i32 @llvm.scmp(i64 %x, i64 %y) + ret i32 %1 +} + +define i64 @scmp.64.64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: scmp.64.64: +; CHECK: # %bb.0: +; CHECK-NEXT: cgr %r2, %r3 +; CHECK-NEXT: lghi %r2, 0 +; CHECK-NEXT: locghih %r2, 1 +; CHECK-NEXT: locghil %r2, -1 +; CHECK-NEXT: br %r14 + %1 = call i64 @llvm.scmp(i64 %x, i64 %y) + ret i64 %1 +} diff --git a/llvm/test/CodeGen/SystemZ/ucmp.ll b/llvm/test/CodeGen/SystemZ/ucmp.ll new file mode 100644 index 00000000000000..4175cd7850a98d --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/ucmp.ll @@ -0,0 +1,109 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +define i8 @ucmp.8.8(i8 zeroext %x, i8 zeroext %y) nounwind { +; CHECK-LABEL: ucmp.8.8: +; CHECK: # %bb.0: +; CHECK-NEXT: cr %r2, %r3 +; CHECK-NEXT: lhi %r2, 0 +; CHECK-NEXT: lochih %r2, 1 +; CHECK-NEXT: lochil %r2, -1 +; CHECK-NEXT: br %r14 + %1 = call i8 @llvm.ucmp(i8 %x, i8 %y) + ret i8 %1 +} + +define i8 @ucmp.8.16(i16 zeroext %x, i16 zeroext %y) nounwind { +; CHECK-LABEL: ucmp.8.16: +; CHECK: # %bb.0: +; CHECK-NEXT: cr %r2, %r3 +; CHECK-NEXT: lhi %r2, 0 +; CHECK-NEXT: lochih %r2, 1 +; CHECK-NEXT: lochil %r2, -1 +; CHECK-NEXT: br %r14 + %1 = call i8 @llvm.ucmp(i16 %x, i16 %y) + ret i8 %1 +} + +define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: ucmp.8.32: +; CHECK: # %bb.0: +; CHECK-NEXT: clr %r2, %r3 +; CHECK-NEXT: lhi %r2, 0 +; CHECK-NEXT: lochih %r2, 1 +; CHECK-NEXT: lochil %r2, -1 +; CHECK-NEXT: br %r14 + %1 = call i8 @llvm.ucmp(i32 %x, i32 %y) + ret i8 %1 +} + +define i8 @ucmp.8.64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ucmp.8.64: +; CHECK: # %bb.0: +; CHECK-NEXT: clgr %r2, %r3 +; CHECK-NEXT: lhi %r2, 0 +; CHECK-NEXT: lochih %r2, 1 +; CHECK-NEXT: lochil %r2, -1 +; CHECK-NEXT: br %r14 + %1 = call i8 @llvm.ucmp(i64 %x, i64 %y) + ret i8 %1 +} + +define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind { +; CHECK-LABEL: ucmp.8.128: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: veclg %v0, %v1 +; CHECK-NEXT: jlh .LBB4_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vchlgs %v2, %v1, %v0 +; CHECK-NEXT: .LBB4_2: +; CHECK-NEXT: lhi %r2, 0 +; CHECK-NEXT: lochil %r2, 1 +; CHECK-NEXT: veclg %v1, %v0 +; CHECK-NEXT: jlh .LBB4_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: vchlgs %v0, %v0, %v1 +; CHECK-NEXT: .LBB4_4: +; CHECK-NEXT: lochil %r2, -1 +; CHECK-NEXT: br %r14 + %1 = call i8 @llvm.ucmp(i128 %x, i128 %y) + ret i8 %1 +} + +define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: ucmp.32.32: +; CHECK: # %bb.0: +; CHECK-NEXT: clr %r2, %r3 +; CHECK-NEXT: lhi %r2, 0 +; CHECK-NEXT: lochih %r2, 1 +; CHECK-NEXT: lochil %r2, -1 +; CHECK-NEXT: br %r14 + %1 = call i32 @llvm.ucmp(i32 %x, i32 %y) + ret i32 %1 +} + +define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ucmp.32.64: +; CHECK: # %bb.0: +; CHECK-NEXT: clgr %r2, %r3 +; CHECK-NEXT: lhi %r2, 0 +; CHECK-NEXT: lochih %r2, 1 +; CHECK-NEXT: lochil %r2, -1 +; CHECK-NEXT: br %r14 + %1 = call i32 @llvm.ucmp(i64 %x, i64 %y) + ret i32 %1 +} + +define i64 @ucmp.64.64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ucmp.64.64: +; CHECK: # %bb.0: +; CHECK-NEXT: clgr %r2, %r3 +; CHECK-NEXT: lghi %r2, 0 +; CHECK-NEXT: locghih %r2, 1 +; CHECK-NEXT: locghil %r2, -1 +; CHECK-NEXT: br %r14 + %1 = call i64 @llvm.ucmp(i64 %x, i64 %y) + ret i64 %1 +} diff --git a/llvm/test/CodeGen/SystemZ/vec-cmp-08.ll b/llvm/test/CodeGen/SystemZ/vec-cmp-08.ll new file mode 100644 index 00000000000000..19dc388f908fbc --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-cmp-08.ll @@ -0,0 +1,412 @@ +; Test v1i128 comparisons. +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test eq. +define <1 x i128> @f1(<1 x i128> %val1, <1 x i128> %val2) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vceqgs %v0, %v24, %v26 +; CHECK-NEXT: vgbm %v24, 65535 +; CHECK-NEXT: ber %r14 +; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: vgbm %v24, 0 +; CHECK-NEXT: br %r14 + %cmp = icmp eq <1 x i128> %val1, %val2 + %ret = sext <1 x i1> %cmp to <1 x i128> + ret <1 x i128> %ret +} + +; Test ne. +define <1 x i128> @f2(<1 x i128> %val1, <1 x i128> %val2) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vceqgs %v0, %v24, %v26 +; CHECK-NEXT: vgbm %v24, 65535 +; CHECK-NEXT: bnher %r14 +; CHECK-NEXT: .LBB1_1: +; CHECK-NEXT: vgbm %v24, 0 +; CHECK-NEXT: br %r14 + %cmp = icmp ne <1 x i128> %val1, %val2 + %ret = sext <1 x i1> %cmp to <1 x i128> + ret <1 x i128> %ret +} + +; Test sgt. +define <1 x i128> @f3(<1 x i128> %val1, <1 x i128> %val2) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: vecg %v26, %v24 +; CHECK-NEXT: jlh .LBB2_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vchlgs %v0, %v24, %v26 +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: vgbm %v24, 65535 +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB2_3: +; CHECK-NEXT: vgbm %v24, 0 +; CHECK-NEXT: br %r14 + %cmp = icmp sgt <1 x i128> %val1, %val2 + %ret = sext <1 x i1> %cmp to <1 x i128> + ret <1 x i128> %ret +} + +; Test sge. +define <1 x i128> @f4(<1 x i128> %val1, <1 x i128> %val2) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: vecg %v24, %v26 +; CHECK-NEXT: jlh .LBB3_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vchlgs %v0, %v26, %v24 +; CHECK-NEXT: .LBB3_2: +; CHECK-NEXT: vgbm %v24, 65535 +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB3_3: +; CHECK-NEXT: vgbm %v24, 0 +; CHECK-NEXT: br %r14 + %cmp = icmp sge <1 x i128> %val1, %val2 + %ret = sext <1 x i1> %cmp to <1 x i128> + ret <1 x i128> %ret +} + +; Test sle. +define <1 x i128> @f5(<1 x i128> %val1, <1 x i128> %val2) { +; CHECK-LABEL: f5: +; CHECK: # %bb.0: +; CHECK-NEXT: vecg %v26, %v24 +; CHECK-NEXT: jlh .LBB4_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vchlgs %v0, %v24, %v26 +; CHECK-NEXT: .LBB4_2: +; CHECK-NEXT: vgbm %v24, 65535 +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB4_3: +; CHECK-NEXT: vgbm %v24, 0 +; CHECK-NEXT: br %r14 + %cmp = icmp sle <1 x i128> %val1, %val2 + %ret = sext <1 x i1> %cmp to <1 x i128> + ret <1 x i128> %ret +} + +; Test slt. +define <1 x i128> @f6(<1 x i128> %val1, <1 x i128> %val2) { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: vecg %v24, %v26 +; CHECK-NEXT: jlh .LBB5_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vchlgs %v0, %v26, %v24 +; CHECK-NEXT: .LBB5_2: +; CHECK-NEXT: vgbm %v24, 65535 +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB5_3: +; CHECK-NEXT: vgbm %v24, 0 +; CHECK-NEXT: br %r14 + %cmp = icmp slt <1 x i128> %val1, %val2 + %ret = sext <1 x i1> %cmp to <1 x i128> + ret <1 x i128> %ret +} + +; Test ugt. +define <1 x i128> @f7(<1 x i128> %val1, <1 x i128> %val2) { +; CHECK-LABEL: f7: +; CHECK: # %bb.0: +; CHECK-NEXT: veclg %v26, %v24 +; CHECK-NEXT: jlh .LBB6_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vchlgs %v0, %v24, %v26 +; CHECK-NEXT: .LBB6_2: +; CHECK-NEXT: vgbm %v24, 65535 +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB6_3: +; CHECK-NEXT: vgbm %v24, 0 +; CHECK-NEXT: br %r14 + %cmp = icmp ugt <1 x i128> %val1, %val2 + %ret = sext <1 x i1> %cmp to <1 x i128> + ret <1 x i128> %ret +} + +; Test uge. +define <1 x i128> @f8(<1 x i128> %val1, <1 x i128> %val2) { +; CHECK-LABEL: f8: +; CHECK: # %bb.0: +; CHECK-NEXT: veclg %v24, %v26 +; CHECK-NEXT: jlh .LBB7_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vchlgs %v0, %v26, %v24 +; CHECK-NEXT: .LBB7_2: +; CHECK-NEXT: vgbm %v24, 65535 +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: vgbm %v24, 0 +; CHECK-NEXT: br %r14 + %cmp = icmp uge <1 x i128> %val1, %val2 + %ret = sext <1 x i1> %cmp to <1 x i128> + ret <1 x i128> %ret +} + +; Test ule. +define <1 x i128> @f9(<1 x i128> %val1, <1 x i128> %val2) { +; CHECK-LABEL: f9: +; CHECK: # %bb.0: +; CHECK-NEXT: veclg %v26, %v24 +; CHECK-NEXT: jlh .LBB8_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vchlgs %v0, %v24, %v26 +; CHECK-NEXT: .LBB8_2: +; CHECK-NEXT: vgbm %v24, 65535 +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB8_3: +; CHECK-NEXT: vgbm %v24, 0 +; CHECK-NEXT: br %r14 + %cmp = icmp ule <1 x i128> %val1, %val2 + %ret = sext <1 x i1> %cmp to <1 x i128> + ret <1 x i128> %ret +} + +; Test ult. +define <1 x i128> @f10(<1 x i128> %val1, <1 x i128> %val2) { +; CHECK-LABEL: f10: +; CHECK: # %bb.0: +; CHECK-NEXT: veclg %v24, %v26 +; CHECK-NEXT: jlh .LBB9_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vchlgs %v0, %v26, %v24 +; CHECK-NEXT: .LBB9_2: +; CHECK-NEXT: vgbm %v24, 65535 +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB9_3: +; CHECK-NEXT: vgbm %v24, 0 +; CHECK-NEXT: br %r14 + %cmp = icmp ult <1 x i128> %val1, %val2 + %ret = sext <1 x i1> %cmp to <1 x i128> + ret <1 x i128> %ret +} + +; Test eq selects. +define <1 x i128> @f11(<1 x i128> %val1, <1 x i128> %val2, +; CHECK-LABEL: f11: +; CHECK: # %bb.0: +; CHECK-NEXT: vceqgs %v0, %v24, %v26 +; CHECK-NEXT: je .LBB10_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vlr %v28, %v30 +; CHECK-NEXT: .LBB10_2: +; CHECK-NEXT: vlr %v24, %v28 +; CHECK-NEXT: br %r14 + <1 x i128> %val3, <1 x i128> %val4) { + %cmp = icmp eq <1 x i128> %val1, %val2 + %ret = select <1 x i1> %cmp, <1 x i128> %val3, <1 x i128> %val4 + ret <1 x i128> %ret +} + +; Test ne selects. +define <1 x i128> @f12(<1 x i128> %val1, <1 x i128> %val2, +; CHECK-LABEL: f12: +; CHECK: # %bb.0: +; CHECK-NEXT: vceqgs %v0, %v24, %v26 +; CHECK-NEXT: jnhe .LBB11_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vlr %v28, %v30 +; CHECK-NEXT: .LBB11_2: +; CHECK-NEXT: vlr %v24, %v28 +; CHECK-NEXT: br %r14 + <1 x i128> %val3, <1 x i128> %val4) { + %cmp = icmp ne <1 x i128> %val1, %val2 + %ret = select <1 x i1> %cmp, <1 x i128> %val3, <1 x i128> %val4 + ret <1 x i128> %ret +} + +; Test sgt selects. +define <1 x i128> @f13(<1 x i128> %val1, <1 x i128> %val2, +; CHECK-LABEL: f13: +; CHECK: # %bb.0: +; CHECK-NEXT: vecg %v26, %v24 +; CHECK-NEXT: je .LBB12_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jnl .LBB12_4 +; CHECK-NEXT: .LBB12_2: +; CHECK-NEXT: vlr %v24, %v28 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB12_3: +; CHECK-NEXT: vchlgs %v0, %v24, %v26 +; CHECK-NEXT: jl .LBB12_2 +; CHECK-NEXT: .LBB12_4: +; CHECK-NEXT: vlr %v28, %v30 +; CHECK-NEXT: vlr %v24, %v28 +; CHECK-NEXT: br %r14 + <1 x i128> %val3, <1 x i128> %val4) { + %cmp = icmp sgt <1 x i128> %val1, %val2 + %ret = select <1 x i1> %cmp, <1 x i128> %val3, <1 x i128> %val4 + ret <1 x i128> %ret +} + +; Test sge selects. +define <1 x i128> @f14(<1 x i128> %val1, <1 x i128> %val2, +; CHECK-LABEL: f14: +; CHECK: # %bb.0: +; CHECK-NEXT: vecg %v24, %v26 +; CHECK-NEXT: je .LBB13_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jl .LBB13_4 +; CHECK-NEXT: .LBB13_2: +; CHECK-NEXT: vlr %v24, %v28 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB13_3: +; CHECK-NEXT: vchlgs %v0, %v26, %v24 +; CHECK-NEXT: jnl .LBB13_2 +; CHECK-NEXT: .LBB13_4: +; CHECK-NEXT: vlr %v28, %v30 +; CHECK-NEXT: vlr %v24, %v28 +; CHECK-NEXT: br %r14 + <1 x i128> %val3, <1 x i128> %val4) { + %cmp = icmp sge <1 x i128> %val1, %val2 + %ret = select <1 x i1> %cmp, <1 x i128> %val3, <1 x i128> %val4 + ret <1 x i128> %ret +} + +; Test sle selects. +define <1 x i128> @f15(<1 x i128> %val1, <1 x i128> %val2, +; CHECK-LABEL: f15: +; CHECK: # %bb.0: +; CHECK-NEXT: vecg %v26, %v24 +; CHECK-NEXT: je .LBB14_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jl .LBB14_4 +; CHECK-NEXT: .LBB14_2: +; CHECK-NEXT: vlr %v24, %v28 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB14_3: +; CHECK-NEXT: vchlgs %v0, %v24, %v26 +; CHECK-NEXT: jnl .LBB14_2 +; CHECK-NEXT: .LBB14_4: +; CHECK-NEXT: vlr %v28, %v30 +; CHECK-NEXT: vlr %v24, %v28 +; CHECK-NEXT: br %r14 + <1 x i128> %val3, <1 x i128> %val4) { + %cmp = icmp sle <1 x i128> %val1, %val2 + %ret = select <1 x i1> %cmp, <1 x i128> %val3, <1 x i128> %val4 + ret <1 x i128> %ret +} + +; Test slt selects. +define <1 x i128> @f16(<1 x i128> %val1, <1 x i128> %val2, +; CHECK-LABEL: f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vecg %v24, %v26 +; CHECK-NEXT: je .LBB15_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jnl .LBB15_4 +; CHECK-NEXT: .LBB15_2: +; CHECK-NEXT: vlr %v24, %v28 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB15_3: +; CHECK-NEXT: vchlgs %v0, %v26, %v24 +; CHECK-NEXT: jl .LBB15_2 +; CHECK-NEXT: .LBB15_4: +; CHECK-NEXT: vlr %v28, %v30 +; CHECK-NEXT: vlr %v24, %v28 +; CHECK-NEXT: br %r14 + <1 x i128> %val3, <1 x i128> %val4) { + %cmp = icmp slt <1 x i128> %val1, %val2 + %ret = select <1 x i1> %cmp, <1 x i128> %val3, <1 x i128> %val4 + ret <1 x i128> %ret +} + +; Test ugt selects. +define <1 x i128> @f17(<1 x i128> %val1, <1 x i128> %val2, +; CHECK-LABEL: f17: +; CHECK: # %bb.0: +; CHECK-NEXT: veclg %v26, %v24 +; CHECK-NEXT: je .LBB16_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jnl .LBB16_4 +; CHECK-NEXT: .LBB16_2: +; CHECK-NEXT: vlr %v24, %v28 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB16_3: +; CHECK-NEXT: vchlgs %v0, %v24, %v26 +; CHECK-NEXT: jl .LBB16_2 +; CHECK-NEXT: .LBB16_4: +; CHECK-NEXT: vlr %v28, %v30 +; CHECK-NEXT: vlr %v24, %v28 +; CHECK-NEXT: br %r14 + <1 x i128> %val3, <1 x i128> %val4) { + %cmp = icmp ugt <1 x i128> %val1, %val2 + %ret = select <1 x i1> %cmp, <1 x i128> %val3, <1 x i128> %val4 + ret <1 x i128> %ret +} + +; Test uge selects. +define <1 x i128> @f18(<1 x i128> %val1, <1 x i128> %val2, +; CHECK-LABEL: f18: +; CHECK: # %bb.0: +; CHECK-NEXT: veclg %v24, %v26 +; CHECK-NEXT: je .LBB17_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jl .LBB17_4 +; CHECK-NEXT: .LBB17_2: +; CHECK-NEXT: vlr %v24, %v28 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB17_3: +; CHECK-NEXT: vchlgs %v0, %v26, %v24 +; CHECK-NEXT: jnl .LBB17_2 +; CHECK-NEXT: .LBB17_4: +; CHECK-NEXT: vlr %v28, %v30 +; CHECK-NEXT: vlr %v24, %v28 +; CHECK-NEXT: br %r14 + <1 x i128> %val3, <1 x i128> %val4) { + %cmp = icmp uge <1 x i128> %val1, %val2 + %ret = select <1 x i1> %cmp, <1 x i128> %val3, <1 x i128> %val4 + ret <1 x i128> %ret +} + +; Test ule selects. +define <1 x i128> @f19(<1 x i128> %val1, <1 x i128> %val2, +; CHECK-LABEL: f19: +; CHECK: # %bb.0: +; CHECK-NEXT: veclg %v26, %v24 +; CHECK-NEXT: je .LBB18_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jl .LBB18_4 +; CHECK-NEXT: .LBB18_2: +; CHECK-NEXT: vlr %v24, %v28 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB18_3: +; CHECK-NEXT: vchlgs %v0, %v24, %v26 +; CHECK-NEXT: jnl .LBB18_2 +; CHECK-NEXT: .LBB18_4: +; CHECK-NEXT: vlr %v28, %v30 +; CHECK-NEXT: vlr %v24, %v28 +; CHECK-NEXT: br %r14 + <1 x i128> %val3, <1 x i128> %val4) { + %cmp = icmp ule <1 x i128> %val1, %val2 + %ret = select <1 x i1> %cmp, <1 x i128> %val3, <1 x i128> %val4 + ret <1 x i128> %ret +} + +; Test ult selects. +define <1 x i128> @f20(<1 x i128> %val1, <1 x i128> %val2, +; CHECK-LABEL: f20: +; CHECK: # %bb.0: +; CHECK-NEXT: veclg %v24, %v26 +; CHECK-NEXT: je .LBB19_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jnl .LBB19_4 +; CHECK-NEXT: .LBB19_2: +; CHECK-NEXT: vlr %v24, %v28 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB19_3: +; CHECK-NEXT: vchlgs %v0, %v26, %v24 +; CHECK-NEXT: jl .LBB19_2 +; CHECK-NEXT: .LBB19_4: +; CHECK-NEXT: vlr %v28, %v30 +; CHECK-NEXT: vlr %v24, %v28 +; CHECK-NEXT: br %r14 + <1 x i128> %val3, <1 x i128> %val4) { + %cmp = icmp ult <1 x i128> %val1, %val2 + %ret = select <1 x i1> %cmp, <1 x i128> %val3, <1 x i128> %val4 + ret <1 x i128> %ret +} diff --git a/llvm/test/CodeGen/Thumb/scmp.ll b/llvm/test/CodeGen/Thumb/scmp.ll new file mode 100644 index 00000000000000..661dbe97cdb3cd --- /dev/null +++ b/llvm/test/CodeGen/Thumb/scmp.ll @@ -0,0 +1,151 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=thumbv7-apple-darwin %s -o - | FileCheck %s + +define i8 @scmp_8_8(i8 signext %x, i8 signext %y) nounwind { +; CHECK-LABEL: scmp_8_8: +; CHECK: @ %bb.0: +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r0, #1 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt r2, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: bx lr + %1 = call i8 @llvm.scmp(i8 %x, i8 %y) + ret i8 %1 +} + +define i8 @scmp_8_16(i16 signext %x, i16 signext %y) nounwind { +; CHECK-LABEL: scmp_8_16: +; CHECK: @ %bb.0: +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r0, #1 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt r2, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: bx lr + %1 = call i8 @llvm.scmp(i16 %x, i16 %y) + ret i8 %1 +} + +define i8 @scmp_8_32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: scmp_8_32: +; CHECK: @ %bb.0: +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r0, #1 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt r2, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: bx lr + %1 = call i8 @llvm.scmp(i32 %x, i32 %y) + ret i8 %1 +} + +define i8 @scmp_8_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: scmp_8_64: +; CHECK: @ %bb.0: +; CHECK-NEXT: subs.w r12, r0, r2 +; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: sbcs.w r12, r1, r3 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt.w r12, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt.w r9, #1 +; CHECK-NEXT: sub.w r0, r9, r12 +; CHECK-NEXT: bx lr + %1 = call i8 @llvm.scmp(i64 %x, i64 %y) + ret i8 %1 +} + +define i8 @scmp_8_128(i128 %x, i128 %y) nounwind { +; CHECK-LABEL: scmp_8_128: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: add.w lr, sp, #16 +; CHECK-NEXT: ldr r4, [sp, #28] +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: ldm.w lr, {r9, r12, lr} +; CHECK-NEXT: subs.w r6, r0, r9 +; CHECK-NEXT: sbcs.w r6, r1, r12 +; CHECK-NEXT: sbcs.w r6, r2, lr +; CHECK-NEXT: sbcs.w r6, r3, r4 +; CHECK-NEXT: mov.w r6, #0 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r6, #1 +; CHECK-NEXT: subs.w r0, r9, r0 +; CHECK-NEXT: sbcs.w r0, r12, r1 +; CHECK-NEXT: sbcs.w r0, lr, r2 +; CHECK-NEXT: sbcs.w r0, r4, r3 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r5, #1 +; CHECK-NEXT: subs r0, r5, r6 +; CHECK-NEXT: pop {r4, r5, r6, pc} + %1 = call i8 @llvm.scmp(i128 %x, i128 %y) + ret i8 %1 +} + +define i32 @scmp_32_32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: scmp_32_32: +; CHECK: @ %bb.0: +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r0, #1 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt r2, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: bx lr + %1 = call i32 @llvm.scmp(i32 %x, i32 %y) + ret i32 %1 +} + +define i32 @scmp_32_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: scmp_32_64: +; CHECK: @ %bb.0: +; CHECK-NEXT: subs.w r12, r0, r2 +; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: sbcs.w r12, r1, r3 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt.w r12, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt.w r9, #1 +; CHECK-NEXT: sub.w r0, r9, r12 +; CHECK-NEXT: bx lr + %1 = call i32 @llvm.scmp(i64 %x, i64 %y) + ret i32 %1 +} + +define i64 @scmp_64_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: scmp_64_64: +; CHECK: @ %bb.0: +; CHECK-NEXT: subs.w r12, r0, r2 +; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: sbcs.w r12, r1, r3 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt.w r12, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt.w r9, #1 +; CHECK-NEXT: sub.w r0, r9, r12 +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: bx lr + %1 = call i64 @llvm.scmp(i64 %x, i64 %y) + ret i64 %1 +} diff --git a/llvm/test/CodeGen/Thumb/ucmp.ll b/llvm/test/CodeGen/Thumb/ucmp.ll new file mode 100644 index 00000000000000..7e6d0a323b11c6 --- /dev/null +++ b/llvm/test/CodeGen/Thumb/ucmp.ll @@ -0,0 +1,151 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=thumbv7-apple-darwin %s -o - | FileCheck %s + +define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind { +; CHECK-LABEL: ucmp_8_8: +; CHECK: @ %bb.0: +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r0, #1 +; CHECK-NEXT: it hi +; CHECK-NEXT: movhi r2, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: bx lr + %1 = call i8 @llvm.ucmp(i8 %x, i8 %y) + ret i8 %1 +} + +define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind { +; CHECK-LABEL: ucmp_8_16: +; CHECK: @ %bb.0: +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r0, #1 +; CHECK-NEXT: it hi +; CHECK-NEXT: movhi r2, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: bx lr + %1 = call i8 @llvm.ucmp(i16 %x, i16 %y) + ret i8 %1 +} + +define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: ucmp_8_32: +; CHECK: @ %bb.0: +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r0, #1 +; CHECK-NEXT: it hi +; CHECK-NEXT: movhi r2, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: bx lr + %1 = call i8 @llvm.ucmp(i32 %x, i32 %y) + ret i8 %1 +} + +define i8 @ucmp_8_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ucmp_8_64: +; CHECK: @ %bb.0: +; CHECK-NEXT: subs.w r12, r0, r2 +; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: sbcs.w r12, r1, r3 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo.w r12, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo.w r9, #1 +; CHECK-NEXT: sub.w r0, r9, r12 +; CHECK-NEXT: bx lr + %1 = call i8 @llvm.ucmp(i64 %x, i64 %y) + ret i8 %1 +} + +define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind { +; CHECK-LABEL: ucmp_8_128: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: add.w lr, sp, #16 +; CHECK-NEXT: ldr r4, [sp, #28] +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: ldm.w lr, {r9, r12, lr} +; CHECK-NEXT: subs.w r6, r0, r9 +; CHECK-NEXT: sbcs.w r6, r1, r12 +; CHECK-NEXT: sbcs.w r6, r2, lr +; CHECK-NEXT: sbcs.w r6, r3, r4 +; CHECK-NEXT: mov.w r6, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r6, #1 +; CHECK-NEXT: subs.w r0, r9, r0 +; CHECK-NEXT: sbcs.w r0, r12, r1 +; CHECK-NEXT: sbcs.w r0, lr, r2 +; CHECK-NEXT: sbcs.w r0, r4, r3 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r5, #1 +; CHECK-NEXT: subs r0, r5, r6 +; CHECK-NEXT: pop {r4, r5, r6, pc} + %1 = call i8 @llvm.ucmp(i128 %x, i128 %y) + ret i8 %1 +} + +define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: ucmp_32_32: +; CHECK: @ %bb.0: +; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r0, #1 +; CHECK-NEXT: it hi +; CHECK-NEXT: movhi r2, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: bx lr + %1 = call i32 @llvm.ucmp(i32 %x, i32 %y) + ret i32 %1 +} + +define i32 @ucmp_32_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ucmp_32_64: +; CHECK: @ %bb.0: +; CHECK-NEXT: subs.w r12, r0, r2 +; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: sbcs.w r12, r1, r3 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo.w r12, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo.w r9, #1 +; CHECK-NEXT: sub.w r0, r9, r12 +; CHECK-NEXT: bx lr + %1 = call i32 @llvm.ucmp(i64 %x, i64 %y) + ret i32 %1 +} + +define i64 @ucmp_64_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ucmp_64_64: +; CHECK: @ %bb.0: +; CHECK-NEXT: subs.w r12, r0, r2 +; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: sbcs.w r12, r1, r3 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo.w r12, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo.w r9, #1 +; CHECK-NEXT: sub.w r0, r9, r12 +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: bx lr + %1 = call i64 @llvm.ucmp(i64 %x, i64 %y) + ret i64 %1 +} diff --git a/llvm/test/CodeGen/WebAssembly/scmp.ll b/llvm/test/CodeGen/WebAssembly/scmp.ll new file mode 100644 index 00000000000000..60ab6ef2f527a3 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/scmp.ll @@ -0,0 +1,147 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -wasm-keep-registers | FileCheck %s + +target triple = "wasm32-unknown-unknown" + +define i8 @scmp.8.8(i8 signext %x, i8 signext %y) nounwind { +; CHECK-LABEL: scmp.8.8: +; CHECK: .functype scmp.8.8 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push4=, 0 +; CHECK-NEXT: local.get $push3=, 1 +; CHECK-NEXT: i32.gt_s $push1=, $pop4, $pop3 +; CHECK-NEXT: local.get $push6=, 0 +; CHECK-NEXT: local.get $push5=, 1 +; CHECK-NEXT: i32.lt_s $push0=, $pop6, $pop5 +; CHECK-NEXT: i32.sub $push2=, $pop1, $pop0 +; CHECK-NEXT: # fallthrough-return + %1 = call i8 @llvm.scmp(i8 %x, i8 %y) + ret i8 %1 +} + +define i8 @scmp.8.16(i16 signext %x, i16 signext %y) nounwind { +; CHECK-LABEL: scmp.8.16: +; CHECK: .functype scmp.8.16 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push4=, 0 +; CHECK-NEXT: local.get $push3=, 1 +; CHECK-NEXT: i32.gt_s $push1=, $pop4, $pop3 +; CHECK-NEXT: local.get $push6=, 0 +; CHECK-NEXT: local.get $push5=, 1 +; CHECK-NEXT: i32.lt_s $push0=, $pop6, $pop5 +; CHECK-NEXT: i32.sub $push2=, $pop1, $pop0 +; CHECK-NEXT: # fallthrough-return + %1 = call i8 @llvm.scmp(i16 %x, i16 %y) + ret i8 %1 +} + +define i8 @scmp.8.32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: scmp.8.32: +; CHECK: .functype scmp.8.32 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push4=, 0 +; CHECK-NEXT: local.get $push3=, 1 +; CHECK-NEXT: i32.gt_s $push1=, $pop4, $pop3 +; CHECK-NEXT: local.get $push6=, 0 +; CHECK-NEXT: local.get $push5=, 1 +; CHECK-NEXT: i32.lt_s $push0=, $pop6, $pop5 +; CHECK-NEXT: i32.sub $push2=, $pop1, $pop0 +; CHECK-NEXT: # fallthrough-return + %1 = call i8 @llvm.scmp(i32 %x, i32 %y) + ret i8 %1 +} + +define i8 @scmp.8.64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: scmp.8.64: +; CHECK: .functype scmp.8.64 (i64, i64) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push4=, 0 +; CHECK-NEXT: local.get $push3=, 1 +; CHECK-NEXT: i64.gt_s $push1=, $pop4, $pop3 +; CHECK-NEXT: local.get $push6=, 0 +; CHECK-NEXT: local.get $push5=, 1 +; CHECK-NEXT: i64.lt_s $push0=, $pop6, $pop5 +; CHECK-NEXT: i32.sub $push2=, $pop1, $pop0 +; CHECK-NEXT: # fallthrough-return + %1 = call i8 @llvm.scmp(i64 %x, i64 %y) + ret i8 %1 +} + +define i8 @scmp.8.128(i128 %x, i128 %y) nounwind { +; CHECK-LABEL: scmp.8.128: +; CHECK: .functype scmp.8.128 (i64, i64, i64, i64) -> (i32) +; CHECK-NEXT: .local i32 +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push10=, 0 +; CHECK-NEXT: local.get $push9=, 2 +; CHECK-NEXT: i64.gt_u $push4=, $pop10, $pop9 +; CHECK-NEXT: local.get $push12=, 1 +; CHECK-NEXT: local.get $push11=, 3 +; CHECK-NEXT: i64.gt_s $push3=, $pop12, $pop11 +; CHECK-NEXT: local.get $push14=, 1 +; CHECK-NEXT: local.get $push13=, 3 +; CHECK-NEXT: i64.eq $push8=, $pop14, $pop13 +; CHECK-NEXT: local.tee $push7=, 4, $pop8 +; CHECK-NEXT: i32.select $push5=, $pop4, $pop3, $pop7 +; CHECK-NEXT: local.get $push16=, 0 +; CHECK-NEXT: local.get $push15=, 2 +; CHECK-NEXT: i64.lt_u $push1=, $pop16, $pop15 +; CHECK-NEXT: local.get $push18=, 1 +; CHECK-NEXT: local.get $push17=, 3 +; CHECK-NEXT: i64.lt_s $push0=, $pop18, $pop17 +; CHECK-NEXT: local.get $push19=, 4 +; CHECK-NEXT: i32.select $push2=, $pop1, $pop0, $pop19 +; CHECK-NEXT: i32.sub $push6=, $pop5, $pop2 +; CHECK-NEXT: # fallthrough-return + %1 = call i8 @llvm.scmp(i128 %x, i128 %y) + ret i8 %1 +} + +define i32 @scmp.32.32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: scmp.32.32: +; CHECK: .functype scmp.32.32 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push4=, 0 +; CHECK-NEXT: local.get $push3=, 1 +; CHECK-NEXT: i32.gt_s $push1=, $pop4, $pop3 +; CHECK-NEXT: local.get $push6=, 0 +; CHECK-NEXT: local.get $push5=, 1 +; CHECK-NEXT: i32.lt_s $push0=, $pop6, $pop5 +; CHECK-NEXT: i32.sub $push2=, $pop1, $pop0 +; CHECK-NEXT: # fallthrough-return + %1 = call i32 @llvm.scmp(i32 %x, i32 %y) + ret i32 %1 +} + +define i32 @scmp.32.64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: scmp.32.64: +; CHECK: .functype scmp.32.64 (i64, i64) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push4=, 0 +; CHECK-NEXT: local.get $push3=, 1 +; CHECK-NEXT: i64.gt_s $push1=, $pop4, $pop3 +; CHECK-NEXT: local.get $push6=, 0 +; CHECK-NEXT: local.get $push5=, 1 +; CHECK-NEXT: i64.lt_s $push0=, $pop6, $pop5 +; CHECK-NEXT: i32.sub $push2=, $pop1, $pop0 +; CHECK-NEXT: # fallthrough-return + %1 = call i32 @llvm.scmp(i64 %x, i64 %y) + ret i32 %1 +} + +define i64 @scmp.64.64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: scmp.64.64: +; CHECK: .functype scmp.64.64 (i64, i64) -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push5=, 0 +; CHECK-NEXT: local.get $push4=, 1 +; CHECK-NEXT: i64.gt_s $push1=, $pop5, $pop4 +; CHECK-NEXT: local.get $push7=, 0 +; CHECK-NEXT: local.get $push6=, 1 +; CHECK-NEXT: i64.lt_s $push0=, $pop7, $pop6 +; CHECK-NEXT: i32.sub $push2=, $pop1, $pop0 +; CHECK-NEXT: i64.extend_i32_s $push3=, $pop2 +; CHECK-NEXT: # fallthrough-return + %1 = call i64 @llvm.scmp(i64 %x, i64 %y) + ret i64 %1 +} diff --git a/llvm/test/CodeGen/WebAssembly/ucmp.ll b/llvm/test/CodeGen/WebAssembly/ucmp.ll new file mode 100644 index 00000000000000..ab7f9b2bab1dab --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/ucmp.ll @@ -0,0 +1,147 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -wasm-keep-registers | FileCheck %s + +target triple = "wasm32-unknown-unknown" + +define i8 @ucmp.8.8(i8 zeroext %x, i8 zeroext %y) nounwind { +; CHECK-LABEL: ucmp.8.8: +; CHECK: .functype ucmp.8.8 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push4=, 0 +; CHECK-NEXT: local.get $push3=, 1 +; CHECK-NEXT: i32.gt_u $push1=, $pop4, $pop3 +; CHECK-NEXT: local.get $push6=, 0 +; CHECK-NEXT: local.get $push5=, 1 +; CHECK-NEXT: i32.lt_u $push0=, $pop6, $pop5 +; CHECK-NEXT: i32.sub $push2=, $pop1, $pop0 +; CHECK-NEXT: # fallthrough-return + %1 = call i8 @llvm.ucmp(i8 %x, i8 %y) + ret i8 %1 +} + +define i8 @ucmp.8.16(i16 zeroext %x, i16 zeroext %y) nounwind { +; CHECK-LABEL: ucmp.8.16: +; CHECK: .functype ucmp.8.16 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push4=, 0 +; CHECK-NEXT: local.get $push3=, 1 +; CHECK-NEXT: i32.gt_u $push1=, $pop4, $pop3 +; CHECK-NEXT: local.get $push6=, 0 +; CHECK-NEXT: local.get $push5=, 1 +; CHECK-NEXT: i32.lt_u $push0=, $pop6, $pop5 +; CHECK-NEXT: i32.sub $push2=, $pop1, $pop0 +; CHECK-NEXT: # fallthrough-return + %1 = call i8 @llvm.ucmp(i16 %x, i16 %y) + ret i8 %1 +} + +define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: ucmp.8.32: +; CHECK: .functype ucmp.8.32 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push4=, 0 +; CHECK-NEXT: local.get $push3=, 1 +; CHECK-NEXT: i32.gt_u $push1=, $pop4, $pop3 +; CHECK-NEXT: local.get $push6=, 0 +; CHECK-NEXT: local.get $push5=, 1 +; CHECK-NEXT: i32.lt_u $push0=, $pop6, $pop5 +; CHECK-NEXT: i32.sub $push2=, $pop1, $pop0 +; CHECK-NEXT: # fallthrough-return + %1 = call i8 @llvm.ucmp(i32 %x, i32 %y) + ret i8 %1 +} + +define i8 @ucmp.8.64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ucmp.8.64: +; CHECK: .functype ucmp.8.64 (i64, i64) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push4=, 0 +; CHECK-NEXT: local.get $push3=, 1 +; CHECK-NEXT: i64.gt_u $push1=, $pop4, $pop3 +; CHECK-NEXT: local.get $push6=, 0 +; CHECK-NEXT: local.get $push5=, 1 +; CHECK-NEXT: i64.lt_u $push0=, $pop6, $pop5 +; CHECK-NEXT: i32.sub $push2=, $pop1, $pop0 +; CHECK-NEXT: # fallthrough-return + %1 = call i8 @llvm.ucmp(i64 %x, i64 %y) + ret i8 %1 +} + +define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind { +; CHECK-LABEL: ucmp.8.128: +; CHECK: .functype ucmp.8.128 (i64, i64, i64, i64) -> (i32) +; CHECK-NEXT: .local i32 +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push10=, 0 +; CHECK-NEXT: local.get $push9=, 2 +; CHECK-NEXT: i64.gt_u $push4=, $pop10, $pop9 +; CHECK-NEXT: local.get $push12=, 1 +; CHECK-NEXT: local.get $push11=, 3 +; CHECK-NEXT: i64.gt_u $push3=, $pop12, $pop11 +; CHECK-NEXT: local.get $push14=, 1 +; CHECK-NEXT: local.get $push13=, 3 +; CHECK-NEXT: i64.eq $push8=, $pop14, $pop13 +; CHECK-NEXT: local.tee $push7=, 4, $pop8 +; CHECK-NEXT: i32.select $push5=, $pop4, $pop3, $pop7 +; CHECK-NEXT: local.get $push16=, 0 +; CHECK-NEXT: local.get $push15=, 2 +; CHECK-NEXT: i64.lt_u $push1=, $pop16, $pop15 +; CHECK-NEXT: local.get $push18=, 1 +; CHECK-NEXT: local.get $push17=, 3 +; CHECK-NEXT: i64.lt_u $push0=, $pop18, $pop17 +; CHECK-NEXT: local.get $push19=, 4 +; CHECK-NEXT: i32.select $push2=, $pop1, $pop0, $pop19 +; CHECK-NEXT: i32.sub $push6=, $pop5, $pop2 +; CHECK-NEXT: # fallthrough-return + %1 = call i8 @llvm.ucmp(i128 %x, i128 %y) + ret i8 %1 +} + +define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: ucmp.32.32: +; CHECK: .functype ucmp.32.32 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push4=, 0 +; CHECK-NEXT: local.get $push3=, 1 +; CHECK-NEXT: i32.gt_u $push1=, $pop4, $pop3 +; CHECK-NEXT: local.get $push6=, 0 +; CHECK-NEXT: local.get $push5=, 1 +; CHECK-NEXT: i32.lt_u $push0=, $pop6, $pop5 +; CHECK-NEXT: i32.sub $push2=, $pop1, $pop0 +; CHECK-NEXT: # fallthrough-return + %1 = call i32 @llvm.ucmp(i32 %x, i32 %y) + ret i32 %1 +} + +define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ucmp.32.64: +; CHECK: .functype ucmp.32.64 (i64, i64) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push4=, 0 +; CHECK-NEXT: local.get $push3=, 1 +; CHECK-NEXT: i64.gt_u $push1=, $pop4, $pop3 +; CHECK-NEXT: local.get $push6=, 0 +; CHECK-NEXT: local.get $push5=, 1 +; CHECK-NEXT: i64.lt_u $push0=, $pop6, $pop5 +; CHECK-NEXT: i32.sub $push2=, $pop1, $pop0 +; CHECK-NEXT: # fallthrough-return + %1 = call i32 @llvm.ucmp(i64 %x, i64 %y) + ret i32 %1 +} + +define i64 @ucmp.64.64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ucmp.64.64: +; CHECK: .functype ucmp.64.64 (i64, i64) -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push5=, 0 +; CHECK-NEXT: local.get $push4=, 1 +; CHECK-NEXT: i64.gt_u $push1=, $pop5, $pop4 +; CHECK-NEXT: local.get $push7=, 0 +; CHECK-NEXT: local.get $push6=, 1 +; CHECK-NEXT: i64.lt_u $push0=, $pop7, $pop6 +; CHECK-NEXT: i32.sub $push2=, $pop1, $pop0 +; CHECK-NEXT: i64.extend_i32_s $push3=, $pop2 +; CHECK-NEXT: # fallthrough-return + %1 = call i64 @llvm.ucmp(i64 %x, i64 %y) + ret i64 %1 +} diff --git a/llvm/test/CodeGen/X86/3dnow-intrinsics.ll b/llvm/test/CodeGen/X86/3dnow-intrinsics.ll deleted file mode 100644 index a82f705b77d848..00000000000000 --- a/llvm/test/CodeGen/X86/3dnow-intrinsics.ll +++ /dev/null @@ -1,896 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+3dnow | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+3dnow | FileCheck %s --check-prefix=X64 - -define <8 x i8> @test_pavgusb(x86_mmx %a.coerce, x86_mmx %b.coerce) nounwind readnone { -; X86-LABEL: test_pavgusb: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: pavgusb %mm1, %mm0 -; X86-NEXT: movq %mm0, (%eax) -; X86-NEXT: retl $4 -; -; X64-LABEL: test_pavgusb: -; X64: # %bb.0: # %entry -; X64-NEXT: pavgusb %mm1, %mm0 -; X64-NEXT: movq2dq %mm0, %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast x86_mmx %a.coerce to <8 x i8> - %1 = bitcast x86_mmx %b.coerce to <8 x i8> - %2 = bitcast <8 x i8> %0 to x86_mmx - %3 = bitcast <8 x i8> %1 to x86_mmx - %4 = call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %2, x86_mmx %3) - %5 = bitcast x86_mmx %4 to <8 x i8> - ret <8 x i8> %5 -} - -declare x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx, x86_mmx) nounwind readnone - -define <2 x i32> @test_pf2id(<2 x float> %a) nounwind readnone { -; X86-LABEL: test_pf2id: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: pf2id %mm1, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pf2id: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm0, %mm0 -; X64-NEXT: pf2id %mm0, %mm0 -; X64-NEXT: movq2dq %mm0, %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = tail call x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx %0) - %2 = bitcast x86_mmx %1 to <2 x i32> - ret <2 x i32> %2 -} - -declare x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx) nounwind readnone - -define <2 x float> @test_pfacc(<2 x float> %a, <2 x float> %b) nounwind readnone { -; X86-LABEL: test_pfacc: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 20(%ebp), %mm0 -; X86-NEXT: movd 16(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm2 -; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X86-NEXT: pfacc %mm1, %mm2 -; X86-NEXT: movq %mm2, (%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: flds (%esp) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pfacc: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm1, %mm0 -; X64-NEXT: movdq2q %xmm0, %mm1 -; X64-NEXT: pfacc %mm0, %mm1 -; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = bitcast <2 x float> %b to x86_mmx - %2 = tail call x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx %0, x86_mmx %1) - %3 = bitcast x86_mmx %2 to <2 x float> - ret <2 x float> %3 -} - -declare x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx, x86_mmx) nounwind readnone - -define <2 x float> @test_pfadd(<2 x float> %a, <2 x float> %b) nounwind readnone { -; X86-LABEL: test_pfadd: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 20(%ebp), %mm0 -; X86-NEXT: movd 16(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm2 -; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X86-NEXT: pfadd %mm1, %mm2 -; X86-NEXT: movq %mm2, (%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: flds (%esp) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pfadd: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm1, %mm0 -; X64-NEXT: movdq2q %xmm0, %mm1 -; X64-NEXT: pfadd %mm0, %mm1 -; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = bitcast <2 x float> %b to x86_mmx - %2 = tail call x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx %0, x86_mmx %1) - %3 = bitcast x86_mmx %2 to <2 x float> - ret <2 x float> %3 -} - -declare x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx, x86_mmx) nounwind readnone - -define <2 x i32> @test_pfcmpeq(<2 x float> %a, <2 x float> %b) nounwind readnone { -; X86-LABEL: test_pfcmpeq: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 20(%ebp), %mm0 -; X86-NEXT: movd 16(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm2 -; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X86-NEXT: pfcmpeq %mm1, %mm2 -; X86-NEXT: movq %mm2, (%esp) -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pfcmpeq: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm1, %mm0 -; X64-NEXT: movdq2q %xmm0, %mm1 -; X64-NEXT: pfcmpeq %mm0, %mm1 -; X64-NEXT: movq2dq %mm1, %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = bitcast <2 x float> %b to x86_mmx - %2 = tail call x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx %0, x86_mmx %1) - %3 = bitcast x86_mmx %2 to <2 x i32> - ret <2 x i32> %3 -} - -declare x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx, x86_mmx) nounwind readnone - -define <2 x i32> @test_pfcmpge(<2 x float> %a, <2 x float> %b) nounwind readnone { -; X86-LABEL: test_pfcmpge: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 20(%ebp), %mm0 -; X86-NEXT: movd 16(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm2 -; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X86-NEXT: pfcmpge %mm1, %mm2 -; X86-NEXT: movq %mm2, (%esp) -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pfcmpge: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm1, %mm0 -; X64-NEXT: movdq2q %xmm0, %mm1 -; X64-NEXT: pfcmpge %mm0, %mm1 -; X64-NEXT: movq2dq %mm1, %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = bitcast <2 x float> %b to x86_mmx - %2 = tail call x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx %0, x86_mmx %1) - %3 = bitcast x86_mmx %2 to <2 x i32> - ret <2 x i32> %3 -} - -declare x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx, x86_mmx) nounwind readnone - -define <2 x i32> @test_pfcmpgt(<2 x float> %a, <2 x float> %b) nounwind readnone { -; X86-LABEL: test_pfcmpgt: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 20(%ebp), %mm0 -; X86-NEXT: movd 16(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm2 -; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X86-NEXT: pfcmpgt %mm1, %mm2 -; X86-NEXT: movq %mm2, (%esp) -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pfcmpgt: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm1, %mm0 -; X64-NEXT: movdq2q %xmm0, %mm1 -; X64-NEXT: pfcmpgt %mm0, %mm1 -; X64-NEXT: movq2dq %mm1, %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = bitcast <2 x float> %b to x86_mmx - %2 = tail call x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx %0, x86_mmx %1) - %3 = bitcast x86_mmx %2 to <2 x i32> - ret <2 x i32> %3 -} - -declare x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx, x86_mmx) nounwind readnone - -define <2 x float> @test_pfmax(<2 x float> %a, <2 x float> %b) nounwind readnone { -; X86-LABEL: test_pfmax: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 20(%ebp), %mm0 -; X86-NEXT: movd 16(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm2 -; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X86-NEXT: pfmax %mm1, %mm2 -; X86-NEXT: movq %mm2, (%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: flds (%esp) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pfmax: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm1, %mm0 -; X64-NEXT: movdq2q %xmm0, %mm1 -; X64-NEXT: pfmax %mm0, %mm1 -; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = bitcast <2 x float> %b to x86_mmx - %2 = tail call x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx %0, x86_mmx %1) - %3 = bitcast x86_mmx %2 to <2 x float> - ret <2 x float> %3 -} - -declare x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx, x86_mmx) nounwind readnone - -define <2 x float> @test_pfmin(<2 x float> %a, <2 x float> %b) nounwind readnone { -; X86-LABEL: test_pfmin: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 20(%ebp), %mm0 -; X86-NEXT: movd 16(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm2 -; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X86-NEXT: pfmin %mm1, %mm2 -; X86-NEXT: movq %mm2, (%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: flds (%esp) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pfmin: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm1, %mm0 -; X64-NEXT: movdq2q %xmm0, %mm1 -; X64-NEXT: pfmin %mm0, %mm1 -; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = bitcast <2 x float> %b to x86_mmx - %2 = tail call x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx %0, x86_mmx %1) - %3 = bitcast x86_mmx %2 to <2 x float> - ret <2 x float> %3 -} - -declare x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx, x86_mmx) nounwind readnone - -define <2 x float> @test_pfmul(<2 x float> %a, <2 x float> %b) nounwind readnone { -; X86-LABEL: test_pfmul: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 20(%ebp), %mm0 -; X86-NEXT: movd 16(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm2 -; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X86-NEXT: pfmul %mm1, %mm2 -; X86-NEXT: movq %mm2, (%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: flds (%esp) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pfmul: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm1, %mm0 -; X64-NEXT: movdq2q %xmm0, %mm1 -; X64-NEXT: pfmul %mm0, %mm1 -; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = bitcast <2 x float> %b to x86_mmx - %2 = tail call x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx %0, x86_mmx %1) - %3 = bitcast x86_mmx %2 to <2 x float> - ret <2 x float> %3 -} - -declare x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx, x86_mmx) nounwind readnone - -define <2 x float> @test_pfrcp(<2 x float> %a) nounwind readnone { -; X86-LABEL: test_pfrcp: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: pfrcp %mm1, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: flds (%esp) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pfrcp: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm0, %mm0 -; X64-NEXT: pfrcp %mm0, %mm0 -; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = tail call x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx %0) - %2 = bitcast x86_mmx %1 to <2 x float> - ret <2 x float> %2 -} - -declare x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx) nounwind readnone - -define <2 x float> @test_pfrcpit1(<2 x float> %a, <2 x float> %b) nounwind readnone { -; X86-LABEL: test_pfrcpit1: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 20(%ebp), %mm0 -; X86-NEXT: movd 16(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm2 -; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X86-NEXT: pfrcpit1 %mm1, %mm2 -; X86-NEXT: movq %mm2, (%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: flds (%esp) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pfrcpit1: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm1, %mm0 -; X64-NEXT: movdq2q %xmm0, %mm1 -; X64-NEXT: pfrcpit1 %mm0, %mm1 -; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = bitcast <2 x float> %b to x86_mmx - %2 = tail call x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx %0, x86_mmx %1) - %3 = bitcast x86_mmx %2 to <2 x float> - ret <2 x float> %3 -} - -declare x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx, x86_mmx) nounwind readnone - -define <2 x float> @test_pfrcpit2(<2 x float> %a, <2 x float> %b) nounwind readnone { -; X86-LABEL: test_pfrcpit2: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 20(%ebp), %mm0 -; X86-NEXT: movd 16(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm2 -; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X86-NEXT: pfrcpit2 %mm1, %mm2 -; X86-NEXT: movq %mm2, (%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: flds (%esp) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pfrcpit2: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm1, %mm0 -; X64-NEXT: movdq2q %xmm0, %mm1 -; X64-NEXT: pfrcpit2 %mm0, %mm1 -; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = bitcast <2 x float> %b to x86_mmx - %2 = tail call x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx %0, x86_mmx %1) - %3 = bitcast x86_mmx %2 to <2 x float> - ret <2 x float> %3 -} - -declare x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx, x86_mmx) nounwind readnone - -define <2 x float> @test_pfrsqrt(<2 x float> %a) nounwind readnone { -; X86-LABEL: test_pfrsqrt: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: pfrsqrt %mm1, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: flds (%esp) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pfrsqrt: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm0, %mm0 -; X64-NEXT: pfrsqrt %mm0, %mm0 -; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = tail call x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx %0) - %2 = bitcast x86_mmx %1 to <2 x float> - ret <2 x float> %2 -} - -declare x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx) nounwind readnone - -define <2 x float> @test_pfrsqit1(<2 x float> %a, <2 x float> %b) nounwind readnone { -; X86-LABEL: test_pfrsqit1: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 20(%ebp), %mm0 -; X86-NEXT: movd 16(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm2 -; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X86-NEXT: pfrsqit1 %mm1, %mm2 -; X86-NEXT: movq %mm2, (%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: flds (%esp) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pfrsqit1: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm1, %mm0 -; X64-NEXT: movdq2q %xmm0, %mm1 -; X64-NEXT: pfrsqit1 %mm0, %mm1 -; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = bitcast <2 x float> %b to x86_mmx - %2 = tail call x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx %0, x86_mmx %1) - %3 = bitcast x86_mmx %2 to <2 x float> - ret <2 x float> %3 -} - -declare x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx, x86_mmx) nounwind readnone - -define <2 x float> @test_pfsub(<2 x float> %a, <2 x float> %b) nounwind readnone { -; X86-LABEL: test_pfsub: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 20(%ebp), %mm0 -; X86-NEXT: movd 16(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm2 -; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X86-NEXT: pfsub %mm1, %mm2 -; X86-NEXT: movq %mm2, (%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: flds (%esp) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pfsub: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm1, %mm0 -; X64-NEXT: movdq2q %xmm0, %mm1 -; X64-NEXT: pfsub %mm0, %mm1 -; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = bitcast <2 x float> %b to x86_mmx - %2 = tail call x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx %0, x86_mmx %1) - %3 = bitcast x86_mmx %2 to <2 x float> - ret <2 x float> %3 -} - -declare x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx, x86_mmx) nounwind readnone - -define <2 x float> @test_pfsubr(<2 x float> %a, <2 x float> %b) nounwind readnone { -; X86-LABEL: test_pfsubr: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 20(%ebp), %mm0 -; X86-NEXT: movd 16(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm2 -; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X86-NEXT: pfsubr %mm1, %mm2 -; X86-NEXT: movq %mm2, (%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: flds (%esp) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pfsubr: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm1, %mm0 -; X64-NEXT: movdq2q %xmm0, %mm1 -; X64-NEXT: pfsubr %mm0, %mm1 -; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = bitcast <2 x float> %b to x86_mmx - %2 = tail call x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx %0, x86_mmx %1) - %3 = bitcast x86_mmx %2 to <2 x float> - ret <2 x float> %3 -} - -declare x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx, x86_mmx) nounwind readnone - -define <2 x float> @test_pi2fd(x86_mmx %a.coerce) nounwind readnone { -; X86-LABEL: test_pi2fd: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: pi2fd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: flds (%esp) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pi2fd: -; X64: # %bb.0: # %entry -; X64-NEXT: pi2fd %mm0, %mm0 -; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast x86_mmx %a.coerce to <2 x i32> - %1 = bitcast <2 x i32> %0 to x86_mmx - %2 = call x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx %1) - %3 = bitcast x86_mmx %2 to <2 x float> - ret <2 x float> %3 -} - -declare x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx) nounwind readnone - -define <4 x i16> @test_pmulhrw(x86_mmx %a.coerce, x86_mmx %b.coerce) nounwind readnone { -; X86-LABEL: test_pmulhrw: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: pmulhrw %mm1, %mm0 -; X86-NEXT: movq %mm0, (%eax) -; X86-NEXT: retl $4 -; -; X64-LABEL: test_pmulhrw: -; X64: # %bb.0: # %entry -; X64-NEXT: pmulhrw %mm1, %mm0 -; X64-NEXT: movq2dq %mm0, %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast x86_mmx %a.coerce to <4 x i16> - %1 = bitcast x86_mmx %b.coerce to <4 x i16> - %2 = bitcast <4 x i16> %0 to x86_mmx - %3 = bitcast <4 x i16> %1 to x86_mmx - %4 = call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %2, x86_mmx %3) - %5 = bitcast x86_mmx %4 to <4 x i16> - ret <4 x i16> %5 -} - -declare x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx, x86_mmx) nounwind readnone - -define <2 x i32> @test_pf2iw(<2 x float> %a) nounwind readnone { -; X86-LABEL: test_pf2iw: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: pf2iw %mm1, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pf2iw: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm0, %mm0 -; X64-NEXT: pf2iw %mm0, %mm0 -; X64-NEXT: movq2dq %mm0, %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = tail call x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx %0) - %2 = bitcast x86_mmx %1 to <2 x i32> - ret <2 x i32> %2 -} - -declare x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx) nounwind readnone - -define <2 x float> @test_pfnacc(<2 x float> %a, <2 x float> %b) nounwind readnone { -; X86-LABEL: test_pfnacc: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 20(%ebp), %mm0 -; X86-NEXT: movd 16(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm2 -; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X86-NEXT: pfnacc %mm1, %mm2 -; X86-NEXT: movq %mm2, (%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: flds (%esp) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pfnacc: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm1, %mm0 -; X64-NEXT: movdq2q %xmm0, %mm1 -; X64-NEXT: pfnacc %mm0, %mm1 -; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = bitcast <2 x float> %b to x86_mmx - %2 = tail call x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx %0, x86_mmx %1) - %3 = bitcast x86_mmx %2 to <2 x float> - ret <2 x float> %3 -} - -declare x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx, x86_mmx) nounwind readnone - -define <2 x float> @test_pfpnacc(<2 x float> %a, <2 x float> %b) nounwind readnone { -; X86-LABEL: test_pfpnacc: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 20(%ebp), %mm0 -; X86-NEXT: movd 16(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm2 -; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X86-NEXT: pfpnacc %mm1, %mm2 -; X86-NEXT: movq %mm2, (%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: flds (%esp) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pfpnacc: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm1, %mm0 -; X64-NEXT: movdq2q %xmm0, %mm1 -; X64-NEXT: pfpnacc %mm0, %mm1 -; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = bitcast <2 x float> %b to x86_mmx - %2 = tail call x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx %0, x86_mmx %1) - %3 = bitcast x86_mmx %2 to <2 x float> - ret <2 x float> %3 -} - -declare x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx, x86_mmx) nounwind readnone - -define <2 x float> @test_pi2fw(x86_mmx %a.coerce) nounwind readnone { -; X86-LABEL: test_pi2fw: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: pi2fw %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: flds (%esp) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pi2fw: -; X64: # %bb.0: # %entry -; X64-NEXT: pi2fw %mm0, %mm0 -; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast x86_mmx %a.coerce to <2 x i32> - %1 = bitcast <2 x i32> %0 to x86_mmx - %2 = call x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx %1) - %3 = bitcast x86_mmx %2 to <2 x float> - ret <2 x float> %3 -} - -declare x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx) nounwind readnone - -define <2 x float> @test_pswapdsf(<2 x float> %a) nounwind readnone { -; X86-LABEL: test_pswapdsf: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: pswapd %mm1, %mm0 # mm0 = mm1[1,0] -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: flds (%esp) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pswapdsf: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm0, %mm0 -; X64-NEXT: pswapd %mm0, %mm0 # mm0 = mm0[1,0] -; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x float> %a to x86_mmx - %1 = tail call x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx %0) - %2 = bitcast x86_mmx %1 to <2 x float> - ret <2 x float> %2 -} - -define <2 x i32> @test_pswapdsi(<2 x i32> %a) nounwind readnone { -; X86-LABEL: test_pswapdsi: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movd 12(%ebp), %mm0 -; X86-NEXT: movd 8(%ebp), %mm1 -; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] -; X86-NEXT: pswapd %mm1, %mm0 # mm0 = mm1[1,0] -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; X64-LABEL: test_pswapdsi: -; X64: # %bb.0: # %entry -; X64-NEXT: movdq2q %xmm0, %mm0 -; X64-NEXT: pswapd %mm0, %mm0 # mm0 = mm0[1,0] -; X64-NEXT: movq2dq %mm0, %xmm0 -; X64-NEXT: retq -entry: - %0 = bitcast <2 x i32> %a to x86_mmx - %1 = tail call x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx %0) - %2 = bitcast x86_mmx %1 to <2 x i32> - ret <2 x i32> %2 -} - -declare x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx) nounwind readnone diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll index 8d8c1d26fc5cac..1ce10c3708d58f 100644 --- a/llvm/test/CodeGen/X86/combine-shl.ll +++ b/llvm/test/CodeGen/X86/combine-shl.ll @@ -1086,19 +1086,10 @@ define <4 x i32> @combine_vec_shl_commuted_clamped1(<4 x i32> %sh, <4 x i32> %am ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX2-LABEL: combine_vec_shl_commuted_clamped1: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] -; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpminud %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: combine_vec_shl_commuted_clamped1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: combine_vec_shl_commuted_clamped1: +; AVX: # %bb.0: +; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %cmp.i = icmp uge <4 x i32> %amt, %shl = shl <4 x i32> %sh, %amt %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %shl diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll index f2a9aa217f7ec6..7bc90534dcc6e0 100644 --- a/llvm/test/CodeGen/X86/combine-srl.ll +++ b/llvm/test/CodeGen/X86/combine-srl.ll @@ -828,19 +828,10 @@ define <4 x i32> @combine_vec_lshr_commuted_clamped1(<4 x i32> %sh, <4 x i32> %a ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX2-LABEL: combine_vec_lshr_commuted_clamped1: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] -; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpminud %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: combine_vec_lshr_commuted_clamped1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: combine_vec_lshr_commuted_clamped1: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %cmp.i = icmp uge <4 x i32> %amt, %shr = lshr <4 x i32> %sh, %amt %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %shr diff --git a/llvm/test/CodeGen/X86/commute-3dnow.ll b/llvm/test/CodeGen/X86/commute-3dnow.ll deleted file mode 100644 index dc3910920365d5..00000000000000 --- a/llvm/test/CodeGen/X86/commute-3dnow.ll +++ /dev/null @@ -1,270 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+3dnow | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+3dnow | FileCheck %s --check-prefix=X64 - -define void @commute_m_pfadd(ptr%a0, ptr%a1, ptr%a2) nounwind { -; X86-LABEL: commute_m_pfadd: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movq (%edx), %mm0 -; X86-NEXT: pfadd (%eax), %mm0 -; X86-NEXT: pfadd (%ecx), %mm0 -; X86-NEXT: movq %mm0, (%ecx) -; X86-NEXT: retl -; -; X64-LABEL: commute_m_pfadd: -; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %mm0 -; X64-NEXT: pfadd (%rsi), %mm0 -; X64-NEXT: pfadd (%rdx), %mm0 -; X64-NEXT: movq %mm0, (%rdx) -; X64-NEXT: retq - %1 = load x86_mmx, ptr %a0 - %2 = load x86_mmx, ptr %a1 - %3 = load x86_mmx, ptr %a2 - %4 = tail call x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx %1, x86_mmx %2) - %5 = tail call x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx %3, x86_mmx %4) - store x86_mmx %5, ptr %a2 - ret void -} -declare x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx, x86_mmx) - -define void @commute_m_pfsub(ptr%a0, ptr%a1, ptr%a2) nounwind { -; X86-LABEL: commute_m_pfsub: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movq (%edx), %mm0 -; X86-NEXT: pfsub (%eax), %mm0 -; X86-NEXT: pfsubr (%ecx), %mm0 -; X86-NEXT: movq %mm0, (%ecx) -; X86-NEXT: retl -; -; X64-LABEL: commute_m_pfsub: -; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %mm0 -; X64-NEXT: pfsub (%rsi), %mm0 -; X64-NEXT: pfsubr (%rdx), %mm0 -; X64-NEXT: movq %mm0, (%rdx) -; X64-NEXT: retq - %1 = load x86_mmx, ptr %a0 - %2 = load x86_mmx, ptr %a1 - %3 = load x86_mmx, ptr %a2 - %4 = tail call x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx %1, x86_mmx %2) - %5 = tail call x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx %3, x86_mmx %4) - store x86_mmx %5, ptr %a2 - ret void -} -declare x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx, x86_mmx) - -define void @commute_m_pfsubr(ptr%a0, ptr%a1, ptr%a2) nounwind { -; X86-LABEL: commute_m_pfsubr: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movq (%edx), %mm0 -; X86-NEXT: pfsubr (%eax), %mm0 -; X86-NEXT: pfsub (%ecx), %mm0 -; X86-NEXT: movq %mm0, (%ecx) -; X86-NEXT: retl -; -; X64-LABEL: commute_m_pfsubr: -; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %mm0 -; X64-NEXT: pfsubr (%rsi), %mm0 -; X64-NEXT: pfsub (%rdx), %mm0 -; X64-NEXT: movq %mm0, (%rdx) -; X64-NEXT: retq - %1 = load x86_mmx, ptr %a0 - %2 = load x86_mmx, ptr %a1 - %3 = load x86_mmx, ptr %a2 - %4 = tail call x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx %1, x86_mmx %2) - %5 = tail call x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx %3, x86_mmx %4) - store x86_mmx %5, ptr %a2 - ret void -} -declare x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx, x86_mmx) - -define void @commute_m_pfmul(ptr%a0, ptr%a1, ptr%a2) nounwind { -; X86-LABEL: commute_m_pfmul: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movq (%edx), %mm0 -; X86-NEXT: pfmul (%eax), %mm0 -; X86-NEXT: pfmul (%ecx), %mm0 -; X86-NEXT: movq %mm0, (%ecx) -; X86-NEXT: retl -; -; X64-LABEL: commute_m_pfmul: -; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %mm0 -; X64-NEXT: pfmul (%rsi), %mm0 -; X64-NEXT: pfmul (%rdx), %mm0 -; X64-NEXT: movq %mm0, (%rdx) -; X64-NEXT: retq - %1 = load x86_mmx, ptr %a0 - %2 = load x86_mmx, ptr %a1 - %3 = load x86_mmx, ptr %a2 - %4 = tail call x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx %1, x86_mmx %2) - %5 = tail call x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx %3, x86_mmx %4) - store x86_mmx %5, ptr %a2 - ret void -} -declare x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx, x86_mmx) - -; PFMAX can't commute without fast-math. -define void @commute_m_pfmax(ptr%a0, ptr%a1, ptr%a2) nounwind { -; X86-LABEL: commute_m_pfmax: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movq (%edx), %mm0 -; X86-NEXT: movq (%ecx), %mm1 -; X86-NEXT: pfmax (%eax), %mm0 -; X86-NEXT: pfmax %mm0, %mm1 -; X86-NEXT: movq %mm1, (%ecx) -; X86-NEXT: retl -; -; X64-LABEL: commute_m_pfmax: -; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %mm0 -; X64-NEXT: movq (%rdx), %mm1 -; X64-NEXT: pfmax (%rsi), %mm0 -; X64-NEXT: pfmax %mm0, %mm1 -; X64-NEXT: movq %mm1, (%rdx) -; X64-NEXT: retq - %1 = load x86_mmx, ptr %a0 - %2 = load x86_mmx, ptr %a1 - %3 = load x86_mmx, ptr %a2 - %4 = tail call x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx %1, x86_mmx %2) - %5 = tail call x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx %3, x86_mmx %4) - store x86_mmx %5, ptr %a2 - ret void -} -declare x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx, x86_mmx) - -; PFMIN can't commute without fast-math. -define void @commute_m_pfmin(ptr%a0, ptr%a1, ptr%a2) nounwind { -; X86-LABEL: commute_m_pfmin: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movq (%edx), %mm0 -; X86-NEXT: movq (%ecx), %mm1 -; X86-NEXT: pfmin (%eax), %mm0 -; X86-NEXT: pfmin %mm0, %mm1 -; X86-NEXT: movq %mm1, (%ecx) -; X86-NEXT: retl -; -; X64-LABEL: commute_m_pfmin: -; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %mm0 -; X64-NEXT: movq (%rdx), %mm1 -; X64-NEXT: pfmin (%rsi), %mm0 -; X64-NEXT: pfmin %mm0, %mm1 -; X64-NEXT: movq %mm1, (%rdx) -; X64-NEXT: retq - %1 = load x86_mmx, ptr %a0 - %2 = load x86_mmx, ptr %a1 - %3 = load x86_mmx, ptr %a2 - %4 = tail call x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx %1, x86_mmx %2) - %5 = tail call x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx %3, x86_mmx %4) - store x86_mmx %5, ptr %a2 - ret void -} -declare x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx, x86_mmx) - -define void @commute_m_pfcmpeq(ptr%a0, ptr%a1, ptr%a2) nounwind { -; X86-LABEL: commute_m_pfcmpeq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movq (%edx), %mm0 -; X86-NEXT: pfcmpeq (%eax), %mm0 -; X86-NEXT: pfcmpeq (%ecx), %mm0 -; X86-NEXT: movq %mm0, (%ecx) -; X86-NEXT: retl -; -; X64-LABEL: commute_m_pfcmpeq: -; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %mm0 -; X64-NEXT: pfcmpeq (%rsi), %mm0 -; X64-NEXT: pfcmpeq (%rdx), %mm0 -; X64-NEXT: movq %mm0, (%rdx) -; X64-NEXT: retq - %1 = load x86_mmx, ptr %a0 - %2 = load x86_mmx, ptr %a1 - %3 = load x86_mmx, ptr %a2 - %4 = tail call x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx %1, x86_mmx %2) - %5 = tail call x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx %3, x86_mmx %4) - store x86_mmx %5, ptr %a2 - ret void -} -declare x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx, x86_mmx) - -define void @commute_m_pavgusb(ptr%a0, ptr%a1, ptr%a2) nounwind { -; X86-LABEL: commute_m_pavgusb: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movq (%edx), %mm0 -; X86-NEXT: pavgusb (%eax), %mm0 -; X86-NEXT: pavgusb (%ecx), %mm0 -; X86-NEXT: movq %mm0, (%ecx) -; X86-NEXT: retl -; -; X64-LABEL: commute_m_pavgusb: -; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %mm0 -; X64-NEXT: pavgusb (%rsi), %mm0 -; X64-NEXT: pavgusb (%rdx), %mm0 -; X64-NEXT: movq %mm0, (%rdx) -; X64-NEXT: retq - %1 = load x86_mmx, ptr %a0 - %2 = load x86_mmx, ptr %a1 - %3 = load x86_mmx, ptr %a2 - %4 = tail call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %1, x86_mmx %2) - %5 = tail call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %3, x86_mmx %4) - store x86_mmx %5, ptr %a2 - ret void -} -declare x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx, x86_mmx) - -define void @commute_m_pmulhrw(ptr%a0, ptr%a1, ptr%a2) nounwind { -; X86-LABEL: commute_m_pmulhrw: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movq (%edx), %mm0 -; X86-NEXT: pmulhrw (%eax), %mm0 -; X86-NEXT: pmulhrw (%ecx), %mm0 -; X86-NEXT: movq %mm0, (%ecx) -; X86-NEXT: retl -; -; X64-LABEL: commute_m_pmulhrw: -; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %mm0 -; X64-NEXT: pmulhrw (%rsi), %mm0 -; X64-NEXT: pmulhrw (%rdx), %mm0 -; X64-NEXT: movq %mm0, (%rdx) -; X64-NEXT: retq - %1 = load x86_mmx, ptr %a0 - %2 = load x86_mmx, ptr %a1 - %3 = load x86_mmx, ptr %a2 - %4 = tail call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %1, x86_mmx %2) - %5 = tail call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %3, x86_mmx %4) - store x86_mmx %5, ptr %a2 - ret void -} -declare x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx, x86_mmx) diff --git a/llvm/test/CodeGen/X86/distancemap.mir b/llvm/test/CodeGen/X86/distancemap.mir index b389a0c6cae70e..0a2f422302bd3a 100644 --- a/llvm/test/CodeGen/X86/distancemap.mir +++ b/llvm/test/CodeGen/X86/distancemap.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc %s -o - -mtriple=x86_64-unknown-linux -run-pass=twoaddressinstruction -verify-machineinstrs | FileCheck %s +# RUN: llc %s -o - -mtriple=x86_64-unknown-linux --passes=two-address-instruction | FileCheck %s # In TwoAddressInstructionPass, new instructions should be added to DistanceMap. # In this case, function convertInstTo3Addr is called on the first ADD diff --git a/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir b/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir index 800af1ce5432ea..559560ac20f8af 100644 --- a/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir +++ b/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir @@ -1,36 +1,31 @@ -# RUN: llc -run-pass postrapseudos -mtriple=x86_64-unknown-unknown -mattr=+3dnow -o - %s | FileCheck %s +# RUN: llc -run-pass postrapseudos -mtriple=x86_64-unknown-unknown -mattr=+mmx -o - %s | FileCheck %s # This test verifies that the ExpandPostRA pass expands the GR64 <-> VR64 # copies into appropriate MMX_MOV instructions. --- | - define <2 x i32> @test_pswapdsi(<2 x i32> %a) nounwind readnone { + define <2 x i32> @test_paddw(<2 x i32> %a) nounwind readnone { entry: %0 = bitcast <2 x i32> %a to x86_mmx - %1 = tail call x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx %0) + %1 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %0, x86_mmx %0) %2 = bitcast x86_mmx %1 to <2 x i32> ret <2 x i32> %2 } - declare x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx) nounwind readnone - ... --- -name: test_pswapdsi +name: test_paddw tracksRegLiveness: true body: | bb.0.entry: liveins: $xmm0 - - $xmm0 = PSHUFDri killed $xmm0, -24 - MOVPQI2QImr $rsp, 1, $noreg, -8, $noreg, killed $xmm0 - $mm0 = PSWAPDrm $rsp, 1, $noreg, -8, $noreg + $mm0 = MMX_MOVDQ2Qrr killed $xmm0 + $mm0 = MMX_PADDWrr killed $mm0, $mm0 + ; Inserted dummy copy here, for test: ; CHECK: $rax = MMX_MOVD64from64rr $mm0 ; CHECK-NEXT: $mm0 = MMX_MOVD64to64rr $rax $rax = COPY $mm0 $mm0 = COPY $rax - MMX_MOVQ64mr $rsp, 1, $noreg, -16, $noreg, killed $mm0 - $xmm0 = MOVQI2PQIrm $rsp, 1, $noreg, -16, $noreg - $xmm0 = PSHUFDri killed $xmm0, -44 - RET64 $xmm0 + $xmm0 = MMX_MOVQ2DQrr killed $mm0 + RET 0, $xmm0 ... diff --git a/llvm/test/CodeGen/X86/huge-stack.ll b/llvm/test/CodeGen/X86/huge-stack.ll new file mode 100644 index 00000000000000..a7ceb4a4ee6fe7 --- /dev/null +++ b/llvm/test/CodeGen/X86/huge-stack.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --version 4 +; RUN: llc -O0 -mtriple=x86_64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK +%large = type [4294967295 x i8] + +define void @foo() unnamed_addr #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: movabsq $8589934462, %rax # imm = 0x1FFFFFF7E +; CHECK-NEXT: subq %rax, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset -122 +; CHECK-NEXT: movb $42, -129(%rsp) +; CHECK-NEXT: movb $43, -128(%rsp) +; CHECK-NEXT: movabsq $8589934462, %rax # imm = 0x1FFFFFF7E +; CHECK-NEXT: addq %rax, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %1 = alloca %large, align 1 + %2 = alloca %large, align 1 + %3 = getelementptr inbounds %large, ptr %1, i64 0, i64 0 + store i8 42, ptr %3, align 1 + %4 = getelementptr inbounds %large, ptr %2, i64 0, i64 0 + store i8 43, ptr %4, align 1 + ret void +} diff --git a/llvm/test/CodeGen/X86/inline-asm-memop.ll b/llvm/test/CodeGen/X86/inline-asm-memop.ll index 83442498076102..01fe2e4bd99a86 100644 --- a/llvm/test/CodeGen/X86/inline-asm-memop.ll +++ b/llvm/test/CodeGen/X86/inline-asm-memop.ll @@ -1,20 +1,47 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O0 < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s ; A bug in X86DAGToDAGISel::matchAddressRecursively create a zext SDValue which ; is quickly replaced by other SDValue but already pushed into vector for later ; calling for SelectionDAGISel::Select_INLINEASM getNode builder, see issue ; 82431 for more infomation. -define void @PR82431(i8 %call, ptr %b) { -; CHECK-LABEL: PR82431: +define i64 @PR82431_0(i8 %call, ptr %b) { +; CHECK-LABEL: PR82431_0: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movb %dil, %al -; CHECK-NEXT: addb $1, %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: # kill: def $rax killed $eax -; CHECK-NEXT: shlq $3, %rax -; CHECK-NEXT: addq %rax, %rsi +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: movq 8(%rsi,%rax,8), %rax +; CHECK-NEXT: retq +entry: + %narrow = add nuw i8 %call, 1 + %idxprom = zext i8 %narrow to i64 + %arrayidx = getelementptr [1 x i64], ptr %b, i64 0, i64 %idxprom + %ret_val = load i64, ptr %arrayidx + ret i64 %ret_val +} + +define i32 @PR82431_1(i32 %0, ptr %f) { +; CHECK-LABEL: PR82431_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: addl %edi, %edi +; CHECK-NEXT: andl $8, %edi +; CHECK-NEXT: movl 4(%rsi,%rdi), %eax +; CHECK-NEXT: retq +entry: + %shr = lshr i32 %0, 1 + %and = and i32 %shr, 2 + %add = or i32 %and, 1 + %idxprom = zext i32 %add to i64 + %arrayidx = getelementptr [0 x i32], ptr %f, i64 0, i64 %idxprom + %ret_val = load i32, ptr %arrayidx + ret i32 %ret_val +} + +define void @PR82431_2(i8 %call, ptr %b) { +; CHECK-LABEL: PR82431_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: retq @@ -25,3 +52,22 @@ entry: tail call void asm "", "=*m,*m,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i64) %arrayidx, ptr elementtype(i64) %arrayidx) ret void } + +define void @PR82431_3(i32 %0, ptr %f) { +; CHECK-LABEL: PR82431_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: addl %edi, %edi +; CHECK-NEXT: andl $8, %edi +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: retq +entry: + %shr = lshr i32 %0, 1 + %and = and i32 %shr, 2 + %add = or i32 %and, 1 + %idxprom = zext i32 %add to i64 + %arrayidx = getelementptr [0 x i32], ptr %f, i64 0, i64 %idxprom + tail call void asm "", "=*m,*m,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %arrayidx, ptr elementtype(i32) %arrayidx) + ret void +} diff --git a/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll b/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll new file mode 100644 index 00000000000000..6376b4d599de72 --- /dev/null +++ b/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll @@ -0,0 +1,200 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s + +define <4 x i32> @hadd_select_v4i32(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: hadd_select_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq +entry: + %and1 = and <4 x i32> %x, + %and2 = and <4 x i32> %y, + %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2) + %cond = icmp ule <4 x i32> %hadd, + %ret = select <4 x i1> %cond, <4 x i32> zeroinitializer, <4 x i32> %hadd + ret <4 x i32> %ret +} + +define <8 x i8> @hadd_trunc_v8i16(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: hadd_trunc_v8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,3,3,3,3,3,3,3] +; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vphaddw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq +entry: + %and1 = and <8 x i16> %x, + %and2 = and <8 x i16> %y, + %hadd = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %and1, <8 x i16> %and2) + %conv = trunc <8 x i16> %hadd to <8 x i8> + ret <8 x i8> %conv +} + +define <8 x i16> @hadd_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) { +; CHECK-LABEL: hadd_trunc_v8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3] +; CHECK-NEXT: vpand %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vpand %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %and1 = and <8 x i32> %x, + %and2 = and <8 x i32> %y, + %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2) + %conv = trunc <8 x i32> %hadd to <8 x i16> + ret <8 x i16> %conv +} + +define <16 x i8> @hadd_trunc_v16i16(<16 x i16> %x, <16 x i16> %y) { +; CHECK-LABEL: hadd_trunc_v16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; CHECK-NEXT: vpand %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vpand %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vphaddw %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %and1 = and <16 x i16> %x, + %and2 = and <16 x i16> %y, + %hadd = tail call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %and1, <16 x i16> %and2) + %conv = trunc <16 x i16> %hadd to <16 x i8> + ret <16 x i8> %conv +} + +define <4 x i32> @hsub_select_shl_v4i32(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: hsub_select_shl_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq + %or1 = or <4 x i32> %x, + %or2 = or <4 x i32> %y, + %hsub = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %or1, <4 x i32> %or2) + %shl = shl <4 x i32> %hsub, + %cond = icmp ule <4 x i32> %shl, + %ret = select <4 x i1> %cond, <4 x i32> zeroinitializer, <4 x i32> %hsub + ret <4 x i32> %ret +} + +define <8 x i8> @hsub_trunc_v8i16(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: hsub_trunc_v8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq +entry: + %or1 = or <8 x i16> %x, + %or2 = or <8 x i16> %y, + %hsub = tail call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %or1, <8 x i16> %or2) + %conv = trunc <8 x i16> %hsub to <8 x i8> + ret <8 x i8> %conv +} + +define <8 x i16> @hsub_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) { +; CHECK-LABEL: hsub_trunc_v8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq +entry: + %or1 = or <8 x i32> %x, + %or2 = or <8 x i32> %y, + %hsub = tail call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %or1, <8 x i32> %or2) + %conv = trunc <8 x i32> %hsub to <8 x i16> + ret <8 x i16> %conv +} + +define <16 x i8> @hsub_trunc_v16i16(<16 x i16> %x, <16 x i16> %y) { +; CHECK-LABEL: hsub_trunc_v16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq +entry: + %or1 = or <16 x i16> %x, + %or2 = or <16 x i16> %y, + %hsub = tail call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %or1, <16 x i16> %or2) + %conv = trunc <16 x i16> %hsub to <16 x i8> + ret <16 x i8> %conv +} + +define <8 x i16> @hadd_extract_2st_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) { +; CHECK-LABEL: hadd_extract_2st_trunc_v8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %and1 = and <8 x i32> %x, + %and2 = and <8 x i32> %y, + %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2) + %andr = and <8 x i32> %hadd, + %conv = trunc <8 x i32> %andr to <8 x i16> + ret <8 x i16> %conv +} + +define <8 x i16> @hadd_extract_8th_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) { +; CHECK-LABEL: hadd_extract_8th_trunc_v8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %and1 = and <8 x i32> %x, + %and2 = and <8 x i32> %y, + %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2) + %andr = and <8 x i32> %hadd, + %conv = trunc <8 x i32> %andr to <8 x i16> + ret <8 x i16> %conv +} + +define <8 x i16> @hadd_extract_2st_trunc_redundant_and_v4i32(<8 x i32> %x, <8 x i32> %y) { +; CHECK-LABEL: hadd_extract_2st_trunc_redundant_and_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %and1 = and <8 x i32> %x, + %and2 = and <8 x i32> %y, + %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2) + %andr = and <8 x i32> %hadd, + %conv = trunc <8 x i32> %andr to <8 x i16> + ret <8 x i16> %conv +} + +define <8 x i16> @hadd_extract_4th_trunc_redundant_and_v4i32(<8 x i32> %x, <8 x i32> %y) { +; CHECK-LABEL: hadd_extract_4th_trunc_redundant_and_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3] +; CHECK-NEXT: vpand %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %and1 = and <8 x i32> %x, + %and2 = and <8 x i32> %y, + %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2) + %andr = and <8 x i32> %hadd, + %conv = trunc <8 x i32> %andr to <8 x i16> + ret <8 x i16> %conv +} diff --git a/llvm/test/CodeGen/X86/pr35982.ll b/llvm/test/CodeGen/X86/pr35982.ll index 4a79a109f8b603..b6022698edaeb9 100644 --- a/llvm/test/CodeGen/X86/pr35982.ll +++ b/llvm/test/CodeGen/X86/pr35982.ll @@ -46,50 +46,5 @@ define float @PR35982_emms(<1 x i64>) nounwind { ret float %11 } -define float @PR35982_femms(<1 x i64>) nounwind { -; NO-POSTRA-LABEL: PR35982_femms: -; NO-POSTRA: # %bb.0: -; NO-POSTRA-NEXT: subl $8, %esp -; NO-POSTRA-NEXT: movl {{[0-9]+}}(%esp), %eax -; NO-POSTRA-NEXT: movq {{[0-9]+}}(%esp), %mm0 -; NO-POSTRA-NEXT: punpckhdq %mm0, %mm0 # mm0 = mm0[1,1] -; NO-POSTRA-NEXT: movd %mm0, %ecx -; NO-POSTRA-NEXT: femms -; NO-POSTRA-NEXT: movl %eax, (%esp) -; NO-POSTRA-NEXT: fildl (%esp) -; NO-POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; NO-POSTRA-NEXT: fiaddl {{[0-9]+}}(%esp) -; NO-POSTRA-NEXT: addl $8, %esp -; NO-POSTRA-NEXT: retl -; -; POSTRA-LABEL: PR35982_femms: -; POSTRA: # %bb.0: -; POSTRA-NEXT: subl $8, %esp -; POSTRA-NEXT: movq {{[0-9]+}}(%esp), %mm0 -; POSTRA-NEXT: movl {{[0-9]+}}(%esp), %eax -; POSTRA-NEXT: punpckhdq %mm0, %mm0 # mm0 = mm0[1,1] -; POSTRA-NEXT: movd %mm0, %ecx -; POSTRA-NEXT: femms -; POSTRA-NEXT: movl %eax, (%esp) -; POSTRA-NEXT: fildl (%esp) -; POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; POSTRA-NEXT: fiaddl {{[0-9]+}}(%esp) -; POSTRA-NEXT: addl $8, %esp -; POSTRA-NEXT: retl - %2 = bitcast <1 x i64> %0 to <2 x i32> - %3 = extractelement <2 x i32> %2, i32 0 - %4 = extractelement <1 x i64> %0, i32 0 - %5 = bitcast i64 %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %5, x86_mmx %5) - %7 = bitcast x86_mmx %6 to <2 x i32> - %8 = extractelement <2 x i32> %7, i32 0 - tail call void @llvm.x86.mmx.femms() - %9 = sitofp i32 %3 to float - %10 = sitofp i32 %8 to float - %11 = fadd float %9, %10 - ret float %11 -} - declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) -declare void @llvm.x86.mmx.femms() declare void @llvm.x86.mmx.emms() diff --git a/llvm/test/CodeGen/X86/pr53247.ll b/llvm/test/CodeGen/X86/pr53247.ll index 2fc2ffb414e0e6..cb5e699c8da5e5 100644 --- a/llvm/test/CodeGen/X86/pr53247.ll +++ b/llvm/test/CodeGen/X86/pr53247.ll @@ -5,18 +5,12 @@ define i32 @PR53247(){ ; SSE-LABEL: PR53247: ; SSE: # %bb.0: # %entry -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: phaddd %xmm0, %xmm0 -; SSE-NEXT: phaddd %xmm0, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: xorl %eax, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: PR53247: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: retq entry: %0 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer) diff --git a/llvm/test/CodeGen/X86/prefetch.ll b/llvm/test/CodeGen/X86/prefetch.ll index c10e0526787d53..c3551644dfb7ff 100644 --- a/llvm/test/CodeGen/X86/prefetch.ll +++ b/llvm/test/CodeGen/X86/prefetch.ll @@ -6,16 +6,11 @@ ; RUN: llc < %s -mtriple=i686-- -mcpu=slm | FileCheck %s -check-prefix=X86-PRFCHWSSE ; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 | FileCheck %s -check-prefix=X86-PRFCHWSSE ; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=X86-SSE -; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow | FileCheck %s -check-prefix=X86-3DNOW -; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow,+prfchw | FileCheck %s -check-prefix=X86-3DNOW +; RUN: llc < %s -mtriple=i686-- -mattr=+prfchw | FileCheck %s -check-prefix=X86-PRFCHWSSE ; Rules: -; 3dnow by itself get you just the single prefetch instruction with no hints ; sse provides prefetch0/1/2/nta -; supporting prefetchw, but not 3dnow implicitly provides prefetcht0/1/2/nta regardless of sse setting as we need something to fall back to for the non-write hint. -; 3dnow prefetch instruction will only get used if you have no other prefetch instructions enabled - -; rdar://10538297 +; supporting prefetchw implicitly provides prefetcht0/1/2/nta as well, as we need something to fall back to for the non-write hint. define void @t(ptr %ptr) nounwind { ; X86-SSE-LABEL: t: @@ -43,19 +38,7 @@ define void @t(ptr %ptr) nounwind { ; X86-PRFCHWSSE-NEXT: prefetchw (%eax) ; X86-PRFCHWSSE-NEXT: prefetchw (%eax) ; X86-PRFCHWSSE-NEXT: retl -; -; X86-3DNOW-LABEL: t: -; X86-3DNOW: # %bb.0: # %entry -; X86-3DNOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-3DNOW-NEXT: prefetch (%eax) -; X86-3DNOW-NEXT: prefetch (%eax) -; X86-3DNOW-NEXT: prefetch (%eax) -; X86-3DNOW-NEXT: prefetch (%eax) -; X86-3DNOW-NEXT: prefetchw (%eax) -; X86-3DNOW-NEXT: prefetchw (%eax) -; X86-3DNOW-NEXT: prefetchw (%eax) -; X86-3DNOW-NEXT: prefetchw (%eax) -; X86-3DNOW-NEXT: retl + entry: tail call void @llvm.prefetch( ptr %ptr, i32 0, i32 1, i32 1 ) tail call void @llvm.prefetch( ptr %ptr, i32 0, i32 2, i32 1 ) diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll index 55dc0d6059e051..7d4bbb06534e6c 100644 --- a/llvm/test/CodeGen/X86/scmp.ll +++ b/llvm/test/CodeGen/X86/scmp.ll @@ -5,24 +5,19 @@ define i8 @scmp.8.8(i8 %x, i8 %y) nounwind { ; X64-LABEL: scmp.8.8: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpb %sil, %dil -; X64-NEXT: setg %cl -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovgel %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: setl %cl +; X64-NEXT: setg %al +; X64-NEXT: subb %cl, %al ; X64-NEXT: retq ; ; X86-LABEL: scmp.8.8: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al -; X86-NEXT: setg %cl -; X86-NEXT: movb $-1, %al -; X86-NEXT: jl .LBB0_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB0_2: +; X86-NEXT: setl %cl +; X86-NEXT: setg %al +; X86-NEXT: subb %cl, %al ; X86-NEXT: retl %1 = call i8 @llvm.scmp(i8 %x, i8 %y) ret i8 %1 @@ -31,24 +26,19 @@ define i8 @scmp.8.8(i8 %x, i8 %y) nounwind { define i8 @scmp.8.16(i16 %x, i16 %y) nounwind { ; X64-LABEL: scmp.8.16: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpw %si, %di -; X64-NEXT: setg %cl -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovgel %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: setl %cl +; X64-NEXT: setg %al +; X64-NEXT: subb %cl, %al ; X64-NEXT: retq ; ; X86-LABEL: scmp.8.16: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %ax -; X86-NEXT: setg %cl -; X86-NEXT: movb $-1, %al -; X86-NEXT: jl .LBB1_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB1_2: +; X86-NEXT: setl %cl +; X86-NEXT: setg %al +; X86-NEXT: subb %cl, %al ; X86-NEXT: retl %1 = call i8 @llvm.scmp(i16 %x, i16 %y) ret i8 %1 @@ -57,24 +47,19 @@ define i8 @scmp.8.16(i16 %x, i16 %y) nounwind { define i8 @scmp.8.32(i32 %x, i32 %y) nounwind { ; X64-LABEL: scmp.8.32: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpl %esi, %edi -; X64-NEXT: setg %cl -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovgel %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: setl %cl +; X64-NEXT: setg %al +; X64-NEXT: subb %cl, %al ; X64-NEXT: retq ; ; X86-LABEL: scmp.8.32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; X86-NEXT: setg %cl -; X86-NEXT: movb $-1, %al -; X86-NEXT: jl .LBB2_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB2_2: +; X86-NEXT: setl %cl +; X86-NEXT: setg %al +; X86-NEXT: subb %cl, %al ; X86-NEXT: retl %1 = call i8 @llvm.scmp(i32 %x, i32 %y) ret i8 %1 @@ -83,35 +68,32 @@ define i8 @scmp.8.32(i32 %x, i32 %y) nounwind { define i8 @scmp.8.64(i64 %x, i64 %y) nounwind { ; X64-LABEL: scmp.8.64: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpq %rsi, %rdi -; X64-NEXT: setg %cl -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovgel %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: setl %cl +; X64-NEXT: setg %al +; X64-NEXT: subb %cl, %al ; X64-NEXT: retq ; ; X86-LABEL: scmp.8.64: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %eax, %esi -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: setl %cl -; X86-NEXT: cmpl %esi, %eax -; X86-NEXT: sbbl %edi, %edx -; X86-NEXT: movb $-1, %al -; X86-NEXT: jl .LBB3_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB3_2: +; X86-NEXT: cmpl %eax, %edx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %ecx, %edi +; X86-NEXT: setl %bl +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: setl %al +; X86-NEXT: subb %bl, %al ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %1 = call i8 @llvm.scmp(i64 %x, i64 %y) ret i8 %1 @@ -120,16 +102,14 @@ define i8 @scmp.8.64(i64 %x, i64 %y) nounwind { define i8 @scmp.8.128(i128 %x, i128 %y) nounwind { ; X64-LABEL: scmp.8.128: ; X64: # %bb.0: +; X64-NEXT: cmpq %rdx, %rdi +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: sbbq %rcx, %rax +; X64-NEXT: setl %r8b ; X64-NEXT: cmpq %rdi, %rdx -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: sbbq %rsi, %rax +; X64-NEXT: sbbq %rsi, %rcx ; X64-NEXT: setl %al -; X64-NEXT: movzbl %al, %r8d -; X64-NEXT: cmpq %rdx, %rdi -; X64-NEXT: sbbq %rcx, %rsi -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovgel %r8d, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: subb %r8b, %al ; X64-NEXT: retq ; ; X86-LABEL: scmp.8.128: @@ -142,26 +122,23 @@ define i8 @scmp.8.128(i128 %x, i128 %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %ebp, %ebx -; X86-NEXT: sbbl %edx, %ebx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sbbl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: sbbl %edx, %ebp +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %ecx -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: sbbl %ebp, %ecx ; X86-NEXT: setl %cl ; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: sbbl %ebp, %edx +; X86-NEXT: sbbl %ebx, %edx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movb $-1, %al -; X86-NEXT: jl .LBB4_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB4_2: +; X86-NEXT: sbbl %esi, %ebp +; X86-NEXT: setl %al +; X86-NEXT: subb %cl, %al ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -174,25 +151,21 @@ define i8 @scmp.8.128(i128 %x, i128 %y) nounwind { define i32 @scmp.32.32(i32 %x, i32 %y) nounwind { ; X64-LABEL: scmp.32.32: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: setl %al ; X64-NEXT: setg %cl -; X64-NEXT: movl $-1, %eax -; X64-NEXT: cmovgel %ecx, %eax +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax ; X64-NEXT: retq ; ; X86-LABEL: scmp.32.32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; X86-NEXT: setg %dl -; X86-NEXT: movl $-1, %eax -; X86-NEXT: jl .LBB5_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movb %dl, %cl -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB5_2: +; X86-NEXT: setl %al +; X86-NEXT: setg %cl +; X86-NEXT: subb %al, %cl +; X86-NEXT: movsbl %cl, %eax ; X86-NEXT: retl %1 = call i32 @llvm.scmp(i32 %x, i32 %y) ret i32 %1 @@ -201,34 +174,34 @@ define i32 @scmp.32.32(i32 %x, i32 %y) nounwind { define i32 @scmp.32.64(i64 %x, i64 %y) nounwind { ; X64-LABEL: scmp.32.64: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: setl %al ; X64-NEXT: setg %cl -; X64-NEXT: movl $-1, %eax -; X64-NEXT: cmovgel %ecx, %eax +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax ; X64-NEXT: retq ; ; X86-LABEL: scmp.32.64: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %eax, %esi -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: setl %cl -; X86-NEXT: cmpl %esi, %eax -; X86-NEXT: sbbl %edi, %edx -; X86-NEXT: movl $-1, %eax -; X86-NEXT: jl .LBB6_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: .LBB6_2: +; X86-NEXT: cmpl %eax, %edx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %ecx, %edi +; X86-NEXT: setl %bl +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: setl %al +; X86-NEXT: subb %bl, %al +; X86-NEXT: movsbl %al, %eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %1 = call i32 @llvm.scmp(i64 %x, i64 %y) ret i32 %1 @@ -237,36 +210,36 @@ define i32 @scmp.32.64(i64 %x, i64 %y) nounwind { define i64 @scmp.64.64(i64 %x, i64 %y) nounwind { ; X64-LABEL: scmp.64.64: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: setl %al ; X64-NEXT: setg %cl -; X64-NEXT: movq $-1, %rax -; X64-NEXT: cmovgeq %rcx, %rax +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbq %cl, %rax ; X64-NEXT: retq ; ; X86-LABEL: scmp.64.64: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %eax, %esi -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: setl %cl -; X86-NEXT: cmpl %esi, %eax -; X86-NEXT: sbbl %edi, %edx -; X86-NEXT: movl $-1, %eax -; X86-NEXT: movl $-1, %edx -; X86-NEXT: jl .LBB7_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: .LBB7_2: +; X86-NEXT: cmpl %eax, %edx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %ecx, %edi +; X86-NEXT: setl %bl +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: setl %al +; X86-NEXT: subb %bl, %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: sarl $31, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %1 = call i64 @llvm.scmp(i64 %x, i64 %y) ret i64 %1 @@ -275,24 +248,19 @@ define i64 @scmp.64.64(i64 %x, i64 %y) nounwind { define i4 @scmp_narrow_result(i32 %x, i32 %y) nounwind { ; X64-LABEL: scmp_narrow_result: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpl %esi, %edi -; X64-NEXT: setg %cl -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovgel %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: setl %cl +; X64-NEXT: setg %al +; X64-NEXT: subb %cl, %al ; X64-NEXT: retq ; ; X86-LABEL: scmp_narrow_result: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; X86-NEXT: setg %cl -; X86-NEXT: movb $-1, %al -; X86-NEXT: jl .LBB8_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB8_2: +; X86-NEXT: setl %cl +; X86-NEXT: setg %al +; X86-NEXT: subb %cl, %al ; X86-NEXT: retl %1 = call i4 @llvm.scmp(i32 %x, i32 %y) ret i4 %1 @@ -305,39 +273,36 @@ define i8 @scmp_narrow_op(i62 %x, i62 %y) nounwind { ; X64-NEXT: sarq $2, %rsi ; X64-NEXT: shlq $2, %rdi ; X64-NEXT: sarq $2, %rdi -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpq %rsi, %rdi -; X64-NEXT: setg %cl -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovgel %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: setl %cl +; X64-NEXT: setg %al +; X64-NEXT: subb %cl, %al ; X64-NEXT: retq ; ; X86-LABEL: scmp_narrow_op: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shll $2, %eax ; X86-NEXT: sarl $2, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: shll $2, %edi -; X86-NEXT: sarl $2, %edi -; X86-NEXT: cmpl %ecx, %esi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: sbbl %eax, %edx -; X86-NEXT: setl %dl -; X86-NEXT: cmpl %esi, %ecx -; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: movb $-1, %al -; X86-NEXT: jl .LBB9_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %eax -; X86-NEXT: .LBB9_2: +; X86-NEXT: shll $2, %esi +; X86-NEXT: sarl $2, %esi +; X86-NEXT: cmpl %ecx, %edx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: setl %bl +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: setl %al +; X86-NEXT: subb %bl, %al ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %1 = call i8 @llvm.scmp(i62 %x, i62 %y) ret i8 %1 @@ -346,39 +311,33 @@ define i8 @scmp_narrow_op(i62 %x, i62 %y) nounwind { define i141 @scmp_wide_result(i32 %x, i32 %y) nounwind { ; X64-LABEL: scmp_wide_result: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: setl %al ; X64-NEXT: setg %cl -; X64-NEXT: movq $-1, %rax -; X64-NEXT: cmovgeq %rcx, %rax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbq %cl, %rax +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: sarq $63, %rdx +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: andl $8191, %ecx # imm = 0x1FFF ; X64-NEXT: retq ; ; X86-LABEL: scmp_wide_result: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx -; X86-NEXT: setg %bl -; X86-NEXT: movl $-1, %edx -; X86-NEXT: movl $-1, %esi -; X86-NEXT: jl .LBB10_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: movb %bl, %cl -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: .LBB10_2: -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl $0, 12(%eax) -; X86-NEXT: movl $0, 8(%eax) -; X86-NEXT: movw $0, 16(%eax) -; X86-NEXT: popl %esi -; X86-NEXT: popl %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: setl %cl +; X86-NEXT: setg %dl +; X86-NEXT: subb %cl, %dl +; X86-NEXT: movsbl %dl, %ecx +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: andl $8191, %ecx # imm = 0x1FFF +; X86-NEXT: movw %cx, 16(%eax) ; X86-NEXT: retl $4 %1 = call i141 @llvm.scmp(i32 %x, i32 %y) ret i141 %1 @@ -387,20 +346,18 @@ define i141 @scmp_wide_result(i32 %x, i32 %y) nounwind { define i8 @scmp_wide_op(i109 %x, i109 %y) nounwind { ; X64-LABEL: scmp_wide_op: ; X64: # %bb.0: -; X64-NEXT: shlq $19, %rsi -; X64-NEXT: sarq $19, %rsi ; X64-NEXT: shlq $19, %rcx ; X64-NEXT: sarq $19, %rcx +; X64-NEXT: shlq $19, %rsi +; X64-NEXT: sarq $19, %rsi +; X64-NEXT: cmpq %rdx, %rdi +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: sbbq %rcx, %rax +; X64-NEXT: setl %r8b ; X64-NEXT: cmpq %rdi, %rdx -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: sbbq %rsi, %rax +; X64-NEXT: sbbq %rsi, %rcx ; X64-NEXT: setl %al -; X64-NEXT: movzbl %al, %r8d -; X64-NEXT: cmpq %rdx, %rdi -; X64-NEXT: sbbq %rcx, %rsi -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovgel %r8d, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: subb %r8b, %al ; X64-NEXT: retq ; ; X86-LABEL: scmp_wide_op: @@ -409,35 +366,31 @@ define i8 @scmp_wide_op(i109 %x, i109 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shll $19, %eax ; X86-NEXT: sarl $19, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $19, %ecx ; X86-NEXT: sarl $19, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %edx, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmpl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: sbbl %edx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %esi -; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: sbbl %ebp, %esi ; X86-NEXT: movl %ecx, %esi ; X86-NEXT: sbbl %eax, %esi -; X86-NEXT: setl {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: cmpl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: setl %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl %edi, %ebx +; X86-NEXT: sbbl %edi, %ebp ; X86-NEXT: sbbl %ecx, %eax -; X86-NEXT: movb $-1, %al -; X86-NEXT: jl .LBB11_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: .LBB11_2: -; X86-NEXT: addl $4, %esp +; X86-NEXT: setl %al +; X86-NEXT: subb %bl, %al ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -454,34 +407,28 @@ define i41 @scmp_uncommon_types(i7 %x, i7 %y) nounwind { ; X64-NEXT: sarb %sil ; X64-NEXT: addb %dil, %dil ; X64-NEXT: sarb %dil -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpb %sil, %dil +; X64-NEXT: setl %al ; X64-NEXT: setg %cl -; X64-NEXT: movq $-1, %rax -; X64-NEXT: cmovgeq %rcx, %rax +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbq %cl, %rax ; X64-NEXT: retq ; ; X86-LABEL: scmp_uncommon_types: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: addb %al, %al ; X86-NEXT: sarb %al -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: addb %dl, %dl -; X86-NEXT: sarb %dl -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpb %al, %dl -; X86-NEXT: setg %bl -; X86-NEXT: movl $-1, %eax -; X86-NEXT: movl $-1, %edx -; X86-NEXT: jl .LBB12_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: movb %bl, %cl -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB12_2: -; X86-NEXT: popl %ebx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addb %cl, %cl +; X86-NEXT: sarb %cl +; X86-NEXT: cmpb %al, %cl +; X86-NEXT: setl %al +; X86-NEXT: setg %cl +; X86-NEXT: subb %al, %cl +; X86-NEXT: movsbl %cl, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: sarl $31, %edx ; X86-NEXT: retl %1 = call i41 @llvm.scmp(i7 %x, i7 %y) ret i41 %1 @@ -494,38 +441,41 @@ define <4 x i32> @scmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: movd %xmm2, %eax ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X64-NEXT: movd %xmm2, %ecx -; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpl %eax, %ecx -; X64-NEXT: setg %dl -; X64-NEXT: movl $-1, %eax -; X64-NEXT: cmovll %eax, %edx -; X64-NEXT: movd %edx, %xmm2 +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: movd %xmm3, %ecx +; X64-NEXT: movd %xmm3, %eax ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-NEXT: movd %xmm3, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm3 +; X64-NEXT: movd %xmm3, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm3 ; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X64-NEXT: movd %xmm1, %ecx -; X64-NEXT: movd %xmm0, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm2 +; X64-NEXT: movd %xmm1, %eax +; X64-NEXT: movd %xmm0, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X64-NEXT: movd %xmm1, %ecx +; X64-NEXT: movd %xmm1, %eax ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X64-NEXT: movd %xmm0, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm0 +; X64-NEXT: movd %xmm0, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; X64-NEXT: movdqa %xmm2, %xmm0 @@ -533,59 +483,41 @@ define <4 x i32> @scmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind { ; ; X86-LABEL: scmp_normal_vectors: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; X86-NEXT: setg %al -; X86-NEXT: movl $-1, %edx -; X86-NEXT: movl $-1, %ebp -; X86-NEXT: jl .LBB13_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movb %al, %bl -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: .LBB13_2: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi -; X86-NEXT: setg %al -; X86-NEXT: movl $-1, %esi -; X86-NEXT: jl .LBB13_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movb %al, %bl -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: .LBB13_4: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: setl %dl +; X86-NEXT: setg %dh +; X86-NEXT: subb %dl, %dh +; X86-NEXT: movsbl %dh, %edx ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi -; X86-NEXT: setg %cl -; X86-NEXT: movl $-1, %edi -; X86-NEXT: jl .LBB13_6 -; X86-NEXT: # %bb.5: -; X86-NEXT: movb %cl, %bl -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: .LBB13_6: -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: setl %bl +; X86-NEXT: setg %bh +; X86-NEXT: subb %bl, %bh +; X86-NEXT: movsbl %bh, %edi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi +; X86-NEXT: setl %bl +; X86-NEXT: setg %bh +; X86-NEXT: subb %bl, %bh +; X86-NEXT: movsbl %bh, %esi ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: setg %cl -; X86-NEXT: jl .LBB13_8 -; X86-NEXT: # %bb.7: -; X86-NEXT: movb %cl, %bl -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: .LBB13_8: -; X86-NEXT: movl %edx, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %ebp, (%eax) +; X86-NEXT: setl %cl +; X86-NEXT: setg %ch +; X86-NEXT: subb %cl, %ch +; X86-NEXT: movsbl %ch, %ecx +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %1 = call <4 x i32> @llvm.scmp(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %1 @@ -596,45 +528,45 @@ define <4 x i8> @scmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64: # %bb.0: ; X64-NEXT: movd %xmm1, %eax ; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpl %eax, %ecx -; X64-NEXT: setg %dl -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovll %eax, %edx -; X64-NEXT: movzbl %dl, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movzbl %cl, %eax ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; X64-NEXT: movd %xmm2, %edx +; X64-NEXT: movd %xmm2, %ecx ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X64-NEXT: movd %xmm2, %esi -; X64-NEXT: xorl %edi, %edi -; X64-NEXT: cmpl %edx, %esi -; X64-NEXT: setg %dil -; X64-NEXT: cmovll %eax, %edi -; X64-NEXT: movzbl %dil, %edx -; X64-NEXT: shll $8, %edx -; X64-NEXT: orl %ecx, %edx +; X64-NEXT: movd %xmm2, %edx +; X64-NEXT: cmpl %ecx, %edx +; X64-NEXT: setl %cl +; X64-NEXT: setg %dl +; X64-NEXT: subb %cl, %dl +; X64-NEXT: movzbl %dl, %ecx +; X64-NEXT: shll $8, %ecx +; X64-NEXT: orl %eax, %ecx ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-NEXT: movd %xmm2, %ecx +; X64-NEXT: movd %xmm2, %eax ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-NEXT: movd %xmm2, %esi -; X64-NEXT: xorl %edi, %edi -; X64-NEXT: cmpl %ecx, %esi -; X64-NEXT: setg %dil -; X64-NEXT: cmovll %eax, %edi -; X64-NEXT: movzbl %dil, %ecx -; X64-NEXT: shll $16, %ecx -; X64-NEXT: orl %edx, %ecx +; X64-NEXT: movd %xmm2, %edx +; X64-NEXT: cmpl %eax, %edx +; X64-NEXT: setl %al +; X64-NEXT: setg %dl +; X64-NEXT: subb %al, %dl +; X64-NEXT: movzbl %dl, %eax +; X64-NEXT: shll $16, %eax +; X64-NEXT: orl %ecx, %eax ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; X64-NEXT: movd %xmm1, %edx +; X64-NEXT: movd %xmm1, %ecx ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; X64-NEXT: movd %xmm0, %esi -; X64-NEXT: xorl %edi, %edi -; X64-NEXT: cmpl %edx, %esi -; X64-NEXT: setg %dil -; X64-NEXT: cmovll %eax, %edi -; X64-NEXT: shll $24, %edi -; X64-NEXT: orl %ecx, %edi -; X64-NEXT: movd %edi, %xmm0 +; X64-NEXT: movd %xmm0, %edx +; X64-NEXT: cmpl %ecx, %edx +; X64-NEXT: setl %cl +; X64-NEXT: setg %dl +; X64-NEXT: subb %cl, %dl +; X64-NEXT: movzbl %dl, %ecx +; X64-NEXT: shll $24, %ecx +; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movd %ecx, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: scmp_narrow_vec_result: @@ -643,41 +575,29 @@ define <4 x i8> @scmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: setg %ch -; X86-NEXT: movb $-1, %dl -; X86-NEXT: movb $-1, %cl -; X86-NEXT: jl .LBB14_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movb %ch, %cl -; X86-NEXT: .LBB14_2: +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; X86-NEXT: setg %al -; X86-NEXT: movb $-1, %ch -; X86-NEXT: jl .LBB14_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movb %al, %ch -; X86-NEXT: .LBB14_4: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi -; X86-NEXT: setg %bl -; X86-NEXT: movb $-1, %dh -; X86-NEXT: jl .LBB14_6 -; X86-NEXT: # %bb.5: -; X86-NEXT: movb %bl, %dh -; X86-NEXT: .LBB14_6: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: setl %ch +; X86-NEXT: setg %cl +; X86-NEXT: subb %ch, %cl ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi +; X86-NEXT: setl %ch ; X86-NEXT: setg %bl -; X86-NEXT: jl .LBB14_8 -; X86-NEXT: # %bb.7: -; X86-NEXT: movb %bl, %dl -; X86-NEXT: .LBB14_8: -; X86-NEXT: movb %dl, 3(%eax) -; X86-NEXT: movb %dh, 2(%eax) -; X86-NEXT: movb %ch, 1(%eax) +; X86-NEXT: subb %ch, %bl +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi +; X86-NEXT: setl %ch +; X86-NEXT: setg %bh +; X86-NEXT: subb %ch, %bh +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: setl %dl +; X86-NEXT: setg %ch +; X86-NEXT: subb %dl, %ch +; X86-NEXT: movb %ch, 3(%eax) +; X86-NEXT: movb %bh, 2(%eax) +; X86-NEXT: movb %bl, 1(%eax) ; X86-NEXT: movb %cl, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -700,97 +620,82 @@ define <4 x i32> @scmp_narrow_vec_op(<4 x i8> %x, <4 x i8> %y) nounwind { ; X64-NEXT: psrad $24, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpl %eax, %ecx -; X64-NEXT: setg %dl -; X64-NEXT: movl $-1, %eax -; X64-NEXT: cmovll %eax, %edx -; X64-NEXT: movd %edx, %xmm0 +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: movd %xmm3, %ecx +; X64-NEXT: movd %xmm3, %eax ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; X64-NEXT: movd %xmm3, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm3 +; X64-NEXT: movd %xmm3, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm3 ; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; X64-NEXT: movd %xmm1, %ecx -; X64-NEXT: movd %xmm2, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm0 +; X64-NEXT: movd %xmm1, %eax +; X64-NEXT: movd %xmm2, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X64-NEXT: movd %xmm1, %ecx +; X64-NEXT: movd %xmm1, %eax ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; X64-NEXT: movd %xmm1, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm1 +; X64-NEXT: movd %xmm1, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm1 ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; X64-NEXT: retq ; ; X86-LABEL: scmp_narrow_vec_op: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al -; X86-NEXT: setg %al -; X86-NEXT: movl $-1, %edx -; X86-NEXT: movl $-1, %ebp -; X86-NEXT: jl .LBB15_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movb %al, %bl -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: .LBB15_2: ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl -; X86-NEXT: setg %bl -; X86-NEXT: movl $-1, %esi -; X86-NEXT: jl .LBB15_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movb %bl, %al -; X86-NEXT: movl %eax, %esi -; X86-NEXT: .LBB15_4: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl +; X86-NEXT: setl %dl +; X86-NEXT: setg %dh +; X86-NEXT: subb %dl, %dh +; X86-NEXT: movsbl %dh, %edx +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bl +; X86-NEXT: setl %bl +; X86-NEXT: setg %bh +; X86-NEXT: subb %bl, %bh +; X86-NEXT: movsbl %bh, %esi ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ch -; X86-NEXT: setg %cl -; X86-NEXT: movl $-1, %edi -; X86-NEXT: jl .LBB15_6 -; X86-NEXT: # %bb.5: -; X86-NEXT: movb %cl, %bl -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: .LBB15_6: -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: setl %ch +; X86-NEXT: setg %bl +; X86-NEXT: subb %ch, %bl +; X86-NEXT: movsbl %bl, %edi ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl -; X86-NEXT: setg %cl -; X86-NEXT: jl .LBB15_8 -; X86-NEXT: # %bb.7: -; X86-NEXT: movb %cl, %bl -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: .LBB15_8: -; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: setl %cl +; X86-NEXT: setg %ch +; X86-NEXT: subb %cl, %ch +; X86-NEXT: movsbl %ch, %ecx +; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: movl %edi, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %ebp, (%eax) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %1 = call <4 x i32> @llvm.scmp(<4 x i8> %x, <4 x i8> %y) ret <4 x i32> %1 @@ -811,163 +716,178 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind { ; X64-NEXT: psrad $24, %xmm6 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] ; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpl %eax, %ecx -; X64-NEXT: setg %dl -; X64-NEXT: movl $-1, %eax -; X64-NEXT: cmovll %eax, %edx -; X64-NEXT: movd %edx, %xmm0 +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] -; X64-NEXT: movd %xmm7, %ecx +; X64-NEXT: movd %xmm7, %eax ; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] -; X64-NEXT: movd %xmm7, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm7 +; X64-NEXT: movd %xmm7, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm7 ; X64-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; X64-NEXT: movd %xmm5, %ecx -; X64-NEXT: movd %xmm6, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm0 +; X64-NEXT: movd %xmm5, %eax +; X64-NEXT: movd %xmm6, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] -; X64-NEXT: movd %xmm5, %ecx +; X64-NEXT: movd %xmm5, %eax ; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,1,1] -; X64-NEXT: movd %xmm5, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm5 +; X64-NEXT: movd %xmm5, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm5 ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] ; X64-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; X64-NEXT: psrad $24, %xmm5 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] -; X64-NEXT: movd %xmm1, %ecx +; X64-NEXT: movd %xmm1, %eax ; X64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; X64-NEXT: psrad $24, %xmm4 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] -; X64-NEXT: movd %xmm1, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm1 +; X64-NEXT: movd %xmm1, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm1 ; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] -; X64-NEXT: movd %xmm6, %ecx +; X64-NEXT: movd %xmm6, %eax ; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] -; X64-NEXT: movd %xmm6, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm6 +; X64-NEXT: movd %xmm6, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm6 ; X64-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; X64-NEXT: movd %xmm5, %ecx -; X64-NEXT: movd %xmm4, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm1 +; X64-NEXT: movd %xmm5, %eax +; X64-NEXT: movd %xmm4, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm1 ; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] -; X64-NEXT: movd %xmm5, %ecx +; X64-NEXT: movd %xmm5, %eax ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; X64-NEXT: movd %xmm4, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm4 +; X64-NEXT: movd %xmm4, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm4 ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0] ; X64-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; X64-NEXT: psrad $24, %xmm5 ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm5[3,3,3,3] -; X64-NEXT: movd %xmm4, %ecx +; X64-NEXT: movd %xmm4, %eax ; X64-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] ; X64-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; X64-NEXT: psrad $24, %xmm6 ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] -; X64-NEXT: movd %xmm2, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm2 +; X64-NEXT: movd %xmm2, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] -; X64-NEXT: movd %xmm7, %ecx +; X64-NEXT: movd %xmm7, %eax ; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] -; X64-NEXT: movd %xmm7, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm7 +; X64-NEXT: movd %xmm7, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm7 ; X64-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; X64-NEXT: movd %xmm5, %ecx -; X64-NEXT: movd %xmm6, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm2 +; X64-NEXT: movd %xmm5, %eax +; X64-NEXT: movd %xmm6, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] -; X64-NEXT: movd %xmm5, %ecx +; X64-NEXT: movd %xmm5, %eax ; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,1,1] -; X64-NEXT: movd %xmm5, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm5 +; X64-NEXT: movd %xmm5, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm5 ; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0] ; X64-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; X64-NEXT: psrad $24, %xmm5 ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm5[3,3,3,3] -; X64-NEXT: movd %xmm3, %ecx +; X64-NEXT: movd %xmm3, %eax ; X64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; X64-NEXT: psrad $24, %xmm4 ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] -; X64-NEXT: movd %xmm3, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm3 +; X64-NEXT: movd %xmm3, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm3 ; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] -; X64-NEXT: movd %xmm6, %ecx +; X64-NEXT: movd %xmm6, %eax ; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] -; X64-NEXT: movd %xmm6, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm6 +; X64-NEXT: movd %xmm6, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm6 ; X64-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; X64-NEXT: movd %xmm5, %ecx -; X64-NEXT: movd %xmm4, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm3 +; X64-NEXT: movd %xmm5, %eax +; X64-NEXT: movd %xmm4, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm3 ; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] -; X64-NEXT: movd %xmm5, %ecx +; X64-NEXT: movd %xmm5, %eax ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; X64-NEXT: movd %xmm4, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %eax, %esi -; X64-NEXT: movd %esi, %xmm4 +; X64-NEXT: movd %xmm4, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movsbl %cl, %eax +; X64-NEXT: movd %eax, %xmm4 ; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; X64-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] ; X64-NEXT: retq @@ -978,202 +898,132 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $48, %esp -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: subl $16, %esp ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: movb {{[0-9]+}}(%esp), %ah +; X86-NEXT: movb {{[0-9]+}}(%esp), %ch +; X86-NEXT: movb {{[0-9]+}}(%esp), %dh +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movb {{[0-9]+}}(%esp), %bh +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: setl %al +; X86-NEXT: setg %cl +; X86-NEXT: subb %al, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bh +; X86-NEXT: setl %al +; X86-NEXT: setg %cl +; X86-NEXT: subb %al, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bl +; X86-NEXT: setl %al +; X86-NEXT: setg %cl +; X86-NEXT: subb %al, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dh +; X86-NEXT: setl %al +; X86-NEXT: setg %cl +; X86-NEXT: subb %al, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ch +; X86-NEXT: setl %al +; X86-NEXT: setg %cl +; X86-NEXT: subb %al, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ah +; X86-NEXT: setl %al +; X86-NEXT: setg %cl +; X86-NEXT: subb %al, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl -; X86-NEXT: setg %dl -; X86-NEXT: movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl $-1, %esi -; X86-NEXT: jl .LBB16_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movb %dl, %cl -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: .LBB16_2: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %edx, %edx +; X86-NEXT: setl %al +; X86-NEXT: setg %cl +; X86-NEXT: subb %al, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al -; X86-NEXT: setg %al -; X86-NEXT: movl $-1, %edi -; X86-NEXT: jl .LBB16_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movb %al, %dl -; X86-NEXT: movl %edx, %edi -; X86-NEXT: .LBB16_4: +; X86-NEXT: setl %al +; X86-NEXT: setg %bh +; X86-NEXT: subb %al, %bh ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl -; X86-NEXT: setg %cl -; X86-NEXT: movl $-1, %ebx -; X86-NEXT: jl .LBB16_6 -; X86-NEXT: # %bb.5: -; X86-NEXT: movb %cl, %dl -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: .LBB16_6: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %edx, %edx ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al -; X86-NEXT: setg %al -; X86-NEXT: movl $-1, %ebp -; X86-NEXT: jl .LBB16_8 -; X86-NEXT: # %bb.7: -; X86-NEXT: movb %al, %dl -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: .LBB16_8: +; X86-NEXT: setl %al +; X86-NEXT: setg %bl +; X86-NEXT: subb %al, %bl ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl -; X86-NEXT: setg %cl -; X86-NEXT: movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: jl .LBB16_10 -; X86-NEXT: # %bb.9: -; X86-NEXT: movb %cl, %dl -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: .LBB16_10: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %edx, %edx ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al -; X86-NEXT: setg %al -; X86-NEXT: movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: jl .LBB16_12 -; X86-NEXT: # %bb.11: -; X86-NEXT: movb %al, %dl -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: .LBB16_12: +; X86-NEXT: setl %al +; X86-NEXT: setg %dh +; X86-NEXT: subb %al, %dh ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl -; X86-NEXT: setg %cl -; X86-NEXT: movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: jl .LBB16_14 -; X86-NEXT: # %bb.13: -; X86-NEXT: movb %cl, %dl -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: .LBB16_14: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %edx, %edx ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al -; X86-NEXT: setg %al -; X86-NEXT: movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: jl .LBB16_16 -; X86-NEXT: # %bb.15: -; X86-NEXT: movb %al, %dl -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: .LBB16_16: +; X86-NEXT: setl %al +; X86-NEXT: setg %dl +; X86-NEXT: subb %al, %dl +; X86-NEXT: movsbl %dl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl -; X86-NEXT: setg %cl -; X86-NEXT: movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: jl .LBB16_18 -; X86-NEXT: # %bb.17: -; X86-NEXT: movb %cl, %dl -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: .LBB16_18: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %edx, %edx ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al -; X86-NEXT: setg %al -; X86-NEXT: movl $-1, (%esp) # 4-byte Folded Spill -; X86-NEXT: jl .LBB16_20 -; X86-NEXT: # %bb.19: -; X86-NEXT: movb %al, %dl -; X86-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-NEXT: .LBB16_20: -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: setl %al +; X86-NEXT: setg %dl +; X86-NEXT: subb %al, %dl +; X86-NEXT: movsbl %dl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl -; X86-NEXT: setg %cl -; X86-NEXT: movl $-1, %ebx -; X86-NEXT: jl .LBB16_22 -; X86-NEXT: # %bb.21: -; X86-NEXT: movb %cl, %dl -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: .LBB16_22: -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al -; X86-NEXT: setg %al -; X86-NEXT: movl $-1, %ebp -; X86-NEXT: jl .LBB16_24 -; X86-NEXT: # %bb.23: -; X86-NEXT: movb %al, %bl -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: .LBB16_24: -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: setl %al +; X86-NEXT: setg %dl +; X86-NEXT: subb %al, %dl +; X86-NEXT: movsbl %dl, %ebp ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: setl %al +; X86-NEXT: setg %dl +; X86-NEXT: subb %al, %dl +; X86-NEXT: movsbl %dl, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: setl %al ; X86-NEXT: setg %ah -; X86-NEXT: movl $-1, %edx -; X86-NEXT: jl .LBB16_26 -; X86-NEXT: # %bb.25: -; X86-NEXT: movb %ah, %bl -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: .LBB16_26: -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: subb %al, %ah +; X86-NEXT: movsbl %ah, %esi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al -; X86-NEXT: setg %al -; X86-NEXT: movl $-1, %esi -; X86-NEXT: jl .LBB16_28 -; X86-NEXT: # %bb.27: -; X86-NEXT: movb %al, %bl -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: .LBB16_28: +; X86-NEXT: setl %al +; X86-NEXT: setg %dl +; X86-NEXT: subb %al, %dl +; X86-NEXT: movsbl %dl, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl -; X86-NEXT: setg %cl -; X86-NEXT: movl $-1, %edi -; X86-NEXT: jl .LBB16_30 -; X86-NEXT: # %bb.29: -; X86-NEXT: movb %cl, %bl -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: .LBB16_30: -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ch -; X86-NEXT: setg %cl -; X86-NEXT: jl .LBB16_32 -; X86-NEXT: # %bb.31: -; X86-NEXT: movb %cl, %bl -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: .LBB16_32: -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, 60(%eax) -; X86-NEXT: movl %edi, 56(%eax) -; X86-NEXT: movl %esi, 52(%eax) -; X86-NEXT: movl %edx, 48(%eax) -; X86-NEXT: movl %ebp, 44(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 40(%eax) -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 36(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 32(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 28(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 24(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 20(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 16(%eax) +; X86-NEXT: movl %esi, 56(%eax) +; X86-NEXT: movl %edi, 52(%eax) +; X86-NEXT: movl %ebp, 48(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %ecx, 44(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 40(%eax) +; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: movsbl %dh, %edx +; X86-NEXT: movl %edx, 36(%eax) +; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload +; X86-NEXT: movsbl %bl, %esi +; X86-NEXT: movl %esi, 32(%eax) +; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X86-NEXT: movsbl %bh, %edi +; X86-NEXT: movl %edi, 28(%eax) +; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload +; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload +; X86-NEXT: movl %ebx, 24(%eax) +; X86-NEXT: movl %edi, 20(%eax) +; X86-NEXT: movl %esi, 16(%eax) +; X86-NEXT: movl %edx, 12(%eax) ; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: addl $48, %esp +; X86-NEXT: addl $16, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1187,121 +1037,136 @@ define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind { ; X64-LABEL: scmp_wide_vec_op: ; X64: # %bb.0: ; X64-NEXT: movq %xmm7, %rax -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: setl %al ; X64-NEXT: setg %cl -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovll %eax, %ecx -; X64-NEXT: movd %ecx, %xmm8 +; X64-NEXT: subb %al, %cl +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movd %eax, %xmm8 ; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] -; X64-NEXT: movq %xmm7, %rcx -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; X64-NEXT: setg %dl -; X64-NEXT: cmovll %eax, %edx -; X64-NEXT: movd %edx, %xmm7 +; X64-NEXT: movq %xmm7, %rax +; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movd %eax, %xmm7 ; X64-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; X64-NEXT: movq %xmm6, %rcx -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; X64-NEXT: setg %dl -; X64-NEXT: cmovll %eax, %edx -; X64-NEXT: movd %edx, %xmm7 +; X64-NEXT: movq %xmm6, %rax +; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movd %eax, %xmm7 ; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; X64-NEXT: movq %xmm6, %rcx -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; X64-NEXT: setg %dl -; X64-NEXT: cmovll %eax, %edx -; X64-NEXT: movd %edx, %xmm6 +; X64-NEXT: movq %xmm6, %rax +; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movd %eax, %xmm6 ; X64-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; X64-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; X64-NEXT: movq %xmm5, %rcx -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; X64-NEXT: setg %dl -; X64-NEXT: cmovll %eax, %edx -; X64-NEXT: movd %edx, %xmm6 +; X64-NEXT: movq %xmm5, %rax +; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movd %eax, %xmm6 ; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; X64-NEXT: movq %xmm5, %rcx -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; X64-NEXT: setg %dl -; X64-NEXT: cmovll %eax, %edx -; X64-NEXT: movd %edx, %xmm5 +; X64-NEXT: movq %xmm5, %rax +; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movd %eax, %xmm5 ; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; X64-NEXT: movq %xmm4, %rcx -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; X64-NEXT: setg %dl -; X64-NEXT: cmovll %eax, %edx -; X64-NEXT: movd %edx, %xmm5 +; X64-NEXT: movq %xmm4, %rax +; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movd %eax, %xmm5 ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; X64-NEXT: movq %xmm4, %rcx -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; X64-NEXT: setg %dl -; X64-NEXT: cmovll %eax, %edx -; X64-NEXT: movd %edx, %xmm4 +; X64-NEXT: movq %xmm4, %rax +; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movd %eax, %xmm4 ; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; X64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; X64-NEXT: movq %xmm3, %rcx -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; X64-NEXT: setg %dl -; X64-NEXT: cmovll %eax, %edx -; X64-NEXT: movd %edx, %xmm4 +; X64-NEXT: movq %xmm3, %rax +; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movd %eax, %xmm4 ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; X64-NEXT: movq %xmm3, %rcx -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; X64-NEXT: setg %dl -; X64-NEXT: cmovll %eax, %edx -; X64-NEXT: movd %edx, %xmm3 +; X64-NEXT: movq %xmm3, %rax +; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movd %eax, %xmm3 ; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; X64-NEXT: movq %xmm2, %rcx -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; X64-NEXT: setg %dl -; X64-NEXT: cmovll %eax, %edx -; X64-NEXT: movd %edx, %xmm3 +; X64-NEXT: movq %xmm2, %rax +; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movd %eax, %xmm3 ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; X64-NEXT: movq %xmm2, %rcx -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; X64-NEXT: setg %dl -; X64-NEXT: cmovll %eax, %edx -; X64-NEXT: movd %edx, %xmm2 +; X64-NEXT: movq %xmm2, %rax +; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movd %eax, %xmm2 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X64-NEXT: movq %xmm1, %rcx -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; X64-NEXT: setg %dl -; X64-NEXT: cmovll %eax, %edx -; X64-NEXT: movd %edx, %xmm2 +; X64-NEXT: movq %xmm1, %rax +; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movd %eax, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; X64-NEXT: movq %xmm1, %rcx -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; X64-NEXT: setg %dl -; X64-NEXT: cmovll %eax, %edx -; X64-NEXT: movd %edx, %xmm1 +; X64-NEXT: movq %xmm1, %rax +; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movd %eax, %xmm1 ; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; X64-NEXT: movq %xmm0, %rcx -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; X64-NEXT: setg %dl -; X64-NEXT: cmovll %eax, %edx -; X64-NEXT: movd %edx, %xmm1 +; X64-NEXT: movq %xmm0, %rax +; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movd %eax, %xmm1 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; X64-NEXT: movq %xmm0, %rcx -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; X64-NEXT: setg %dl -; X64-NEXT: cmovll %eax, %edx -; X64-NEXT: movd %edx, %xmm0 +; X64-NEXT: movq %xmm0, %rax +; X64-NEXT: cmpq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: setl %al +; X64-NEXT: setg %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] @@ -1315,41 +1180,76 @@ define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $16, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: sbbl %esi, %ebp +; X86-NEXT: setl %al +; X86-NEXT: cmpl %edi, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: setl %ah +; X86-NEXT: subb %al, %ah +; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpl %ecx, %ebp +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: sbbl %edx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %eax, %edx -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: setl %al +; X86-NEXT: cmpl %ebp, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sbbl %ebx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: setl %ah +; X86-NEXT: subb %al, %ah +; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpl %edi, %ecx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: setl %al +; X86-NEXT: cmpl %ecx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: setl %dl +; X86-NEXT: subb %al, %dl +; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmpl %ebp, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: setl %bl -; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmpl %edi, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl %edi, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movb $-1, %bh -; X86-NEXT: jl .LBB17_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movb %bl, %bh -; X86-NEXT: .LBB17_2: -; X86-NEXT: cmpl %ecx, %edx -; X86-NEXT: movl %esi, %edi -; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: setl %cl +; X86-NEXT: subb %bl, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: sbbl %eax, %ecx ; X86-NEXT: setl %bl -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmpl %edi, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: jl .LBB17_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: .LBB17_4: +; X86-NEXT: setl %bh +; X86-NEXT: subb %bl, %bh +; X86-NEXT: movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: cmpl %edx, %eax ; X86-NEXT: movl %esi, %edi ; X86-NEXT: sbbl %ecx, %edi @@ -1359,215 +1259,138 @@ define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: setl %bh +; X86-NEXT: subb %bl, %bh +; X86-NEXT: movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: jl .LBB17_6 -; X86-NEXT: # %bb.5: -; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: .LBB17_6: ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: movl %esi, %edi ; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: setl %bl ; X86-NEXT: cmpl %ecx, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: jl .LBB17_8 -; X86-NEXT: # %bb.7: -; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: .LBB17_8: -; X86-NEXT: cmpl %edx, %eax -; X86-NEXT: movl %esi, %edi -; X86-NEXT: sbbl %ecx, %edi -; X86-NEXT: setl %bl -; X86-NEXT: cmpl %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %esi, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: setl %bh +; X86-NEXT: subb %bl, %bh +; X86-NEXT: movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpl %ecx, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: jl .LBB17_10 -; X86-NEXT: # %bb.9: -; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: .LBB17_10: -; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: movl %esi, %edi ; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: setl %bl -; X86-NEXT: cmpl %ecx, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: jl .LBB17_12 -; X86-NEXT: # %bb.11: -; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: .LBB17_12: -; X86-NEXT: cmpl %edx, %eax -; X86-NEXT: movl %esi, %edi -; X86-NEXT: sbbl %ecx, %edi -; X86-NEXT: setl %bl -; X86-NEXT: cmpl %eax, %edx +; X86-NEXT: setl %dl +; X86-NEXT: subb %bl, %dl +; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %esi, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %ecx, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: jl .LBB17_14 -; X86-NEXT: # %bb.13: -; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: .LBB17_14: -; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: movl %esi, %edi ; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: setl %bl -; X86-NEXT: cmpl %ecx, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: sbbl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: jl .LBB17_16 -; X86-NEXT: # %bb.15: -; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: .LBB17_16: -; X86-NEXT: cmpl %edx, %eax -; X86-NEXT: movl %edi, %esi -; X86-NEXT: sbbl %ecx, %esi -; X86-NEXT: setl %bl -; X86-NEXT: cmpl %eax, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edi, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: setl %dl +; X86-NEXT: subb %bl, %dl +; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: jl .LBB17_18 -; X86-NEXT: # %bb.17: -; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: .LBB17_18: -; X86-NEXT: cmpl %esi, %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %edx, %edi ; X86-NEXT: setl %bl -; X86-NEXT: cmpl %ecx, %esi +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: setl %al +; X86-NEXT: subb %bl, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %ebp, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: setl %al +; X86-NEXT: cmpl %ecx, %ebp +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: setl %cl +; X86-NEXT: subb %al, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebp, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: jl .LBB17_20 -; X86-NEXT: # %bb.19: -; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: .LBB17_20: -; X86-NEXT: cmpl %esi, %eax -; X86-NEXT: movl %edx, %edi -; X86-NEXT: sbbl %ecx, %edi -; X86-NEXT: setl %bl -; X86-NEXT: cmpl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: setl %al +; X86-NEXT: cmpl %ecx, %ebp +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: setl %cl +; X86-NEXT: subb %al, %cl +; X86-NEXT: movb %cl, (%esp) # 1-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: jl .LBB17_22 -; X86-NEXT: # %bb.21: -; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: .LBB17_22: -; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: setl %bl -; X86-NEXT: cmpl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl %ecx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movb $-1, %cl -; X86-NEXT: jl .LBB17_24 -; X86-NEXT: # %bb.23: -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: .LBB17_24: -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl %edi, %eax -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: sbbl %esi, %ebp +; X86-NEXT: setl %dl +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: sbbl %edi, %esi ; X86-NEXT: setl %ch -; X86-NEXT: cmpl %eax, %edi +; X86-NEXT: subb %dl, %ch +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %edx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: sbbl %edi, %ebp +; X86-NEXT: setl %cl +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: setl %dl +; X86-NEXT: subb %cl, %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movb $-1, %cl -; X86-NEXT: jl .LBB17_26 -; X86-NEXT: # %bb.25: -; X86-NEXT: movb %ch, %cl -; X86-NEXT: .LBB17_26: -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl %edi, %esi -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: sbbl %eax, %ecx -; X86-NEXT: setl %dh -; X86-NEXT: cmpl %esi, %edi +; X86-NEXT: cmpl %ebx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movb $-1, %al -; X86-NEXT: jl .LBB17_28 -; X86-NEXT: # %bb.27: -; X86-NEXT: movb %dh, %al -; X86-NEXT: .LBB17_28: -; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %edi, %ebx -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: sbbl %esi, %ebp +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: sbbl %edi, %ebp +; X86-NEXT: setl %dh +; X86-NEXT: cmpl %esi, %ebx +; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: setl %cl -; X86-NEXT: cmpl %ebx, %edi +; X86-NEXT: subb %dh, %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movb $-1, %dh -; X86-NEXT: jl .LBB17_30 -; X86-NEXT: # %bb.29: -; X86-NEXT: movb %cl, %dh -; X86-NEXT: .LBB17_30: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl %ebx, %ecx -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: sbbl %edi, %esi -; X86-NEXT: setl %dl -; X86-NEXT: cmpl %ecx, %ebx -; X86-NEXT: sbbl %ebp, %edi -; X86-NEXT: movb $-1, %bl -; X86-NEXT: jl .LBB17_32 -; X86-NEXT: # %bb.31: -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: .LBB17_32: +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: sbbl %edi, %ebp +; X86-NEXT: setl %dh +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: setl %bl +; X86-NEXT: subb %dh, %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movb %bl, 15(%eax) -; X86-NEXT: movb %dh, 14(%eax) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: movb %cl, 13(%eax) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: movb %cl, 12(%eax) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: movb %cl, 14(%eax) +; X86-NEXT: movb %dl, 13(%eax) +; X86-NEXT: movb %ch, 12(%eax) +; X86-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload ; X86-NEXT: movb %cl, 11(%eax) ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NEXT: movb %cl, 10(%eax) @@ -1591,7 +1414,7 @@ define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind { ; X86-NEXT: movb %cl, 1(%eax) ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NEXT: movb %cl, (%eax) -; X86-NEXT: addl $16, %esp +; X86-NEXT: addl $12, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1607,111 +1430,158 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; X64-NEXT: pushq %rbp ; X64-NEXT: pushq %r15 ; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx ; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; X64-NEXT: addb %r11b, %r11b -; X64-NEXT: sarb %r11b -; X64-NEXT: addb %dl, %dl -; X64-NEXT: sarb %dl -; X64-NEXT: xorl %edi, %edi -; X64-NEXT: cmpb %r11b, %dl -; X64-NEXT: setg %dil -; X64-NEXT: movq $-1, %r11 -; X64-NEXT: cmovlq %r11, %rdi -; X64-NEXT: addb %r12b, %r12b -; X64-NEXT: sarb %r12b -; X64-NEXT: addb %cl, %cl -; X64-NEXT: sarb %cl -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpb %r12b, %cl -; X64-NEXT: setg %dl -; X64-NEXT: cmovlq %r11, %rdx ; X64-NEXT: addb %r15b, %r15b ; X64-NEXT: sarb %r15b -; X64-NEXT: addb %r8b, %r8b -; X64-NEXT: sarb %r8b -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: cmpb %r15b, %r8b -; X64-NEXT: setg %cl -; X64-NEXT: cmovlq %r11, %rcx +; X64-NEXT: addb %sil, %sil +; X64-NEXT: sarb %sil +; X64-NEXT: cmpb %r15b, %sil +; X64-NEXT: setl %sil +; X64-NEXT: setg %r15b +; X64-NEXT: subb %sil, %r15b +; X64-NEXT: movsbq %r15b, %rsi +; X64-NEXT: movq %rsi, (%rax) +; X64-NEXT: movq %rsi, %xmm0 +; X64-NEXT: sarq $63, %rsi ; X64-NEXT: addb %r14b, %r14b ; X64-NEXT: sarb %r14b -; X64-NEXT: addb %r9b, %r9b -; X64-NEXT: sarb %r9b -; X64-NEXT: xorl %r8d, %r8d -; X64-NEXT: cmpb %r14b, %r9b -; X64-NEXT: setg %r8b -; X64-NEXT: cmovlq %r11, %r8 +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; X64-NEXT: addb %r15b, %r15b +; X64-NEXT: sarb %r15b +; X64-NEXT: cmpb %r14b, %r15b +; X64-NEXT: setl %r14b +; X64-NEXT: setg %r15b +; X64-NEXT: subb %r14b, %r15b +; X64-NEXT: movsbq %r15b, %r14 +; X64-NEXT: movq %r14, %r15 +; X64-NEXT: sarq $63, %r15 ; X64-NEXT: addb %bpl, %bpl ; X64-NEXT: sarb %bpl -; X64-NEXT: addb %sil, %sil -; X64-NEXT: sarb %sil -; X64-NEXT: xorl %r9d, %r9d -; X64-NEXT: cmpb %bpl, %sil -; X64-NEXT: setg %r9b -; X64-NEXT: cmovlq %r11, %r9 +; X64-NEXT: addb %dl, %dl +; X64-NEXT: sarb %dl +; X64-NEXT: cmpb %bpl, %dl +; X64-NEXT: setl %dl +; X64-NEXT: setg %bpl +; X64-NEXT: subb %dl, %bpl +; X64-NEXT: movsbq %bpl, %rdx +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: sarq $63, %r12 ; X64-NEXT: addb %bl, %bl ; X64-NEXT: sarb %bl +; X64-NEXT: addb %cl, %cl +; X64-NEXT: sarb %cl +; X64-NEXT: cmpb %bl, %cl +; X64-NEXT: setl %cl +; X64-NEXT: setg %bl +; X64-NEXT: subb %cl, %bl +; X64-NEXT: movsbq %bl, %rbx +; X64-NEXT: movq %rbx, %rcx +; X64-NEXT: sarq $63, %rcx +; X64-NEXT: addb %r11b, %r11b +; X64-NEXT: sarb %r11b +; X64-NEXT: addb %r8b, %r8b +; X64-NEXT: sarb %r8b +; X64-NEXT: cmpb %r11b, %r8b +; X64-NEXT: setl %r8b +; X64-NEXT: setg %r11b +; X64-NEXT: subb %r8b, %r11b +; X64-NEXT: movsbq %r11b, %r8 +; X64-NEXT: movq %r8, %r11 +; X64-NEXT: sarq $63, %r11 +; X64-NEXT: addb %r10b, %r10b +; X64-NEXT: sarb %r10b +; X64-NEXT: addb %r9b, %r9b +; X64-NEXT: sarb %r9b +; X64-NEXT: cmpb %r10b, %r9b +; X64-NEXT: setl %r9b +; X64-NEXT: setg %r10b +; X64-NEXT: subb %r9b, %r10b +; X64-NEXT: movsbq %r10b, %r9 +; X64-NEXT: movq %r9, %r10 +; X64-NEXT: sarq $63, %r10 +; X64-NEXT: addb %dil, %dil +; X64-NEXT: sarb %dil ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; X64-NEXT: addb %bpl, %bpl ; X64-NEXT: sarb %bpl -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpb %bl, %bpl -; X64-NEXT: setg %sil -; X64-NEXT: cmovlq %r11, %rsi -; X64-NEXT: addb %r10b, %r10b -; X64-NEXT: sarb %r10b -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; X64-NEXT: addb %bl, %bl -; X64-NEXT: sarb %bl -; X64-NEXT: xorl %r14d, %r14d -; X64-NEXT: cmpb %r10b, %bl -; X64-NEXT: setg %r14b -; X64-NEXT: cmovlq %r11, %r14 -; X64-NEXT: movq %r14, %r10 -; X64-NEXT: shrq $2, %r10 -; X64-NEXT: movq %r10, 88(%rax) -; X64-NEXT: movq %rsi, %r10 -; X64-NEXT: shlq $9, %r10 +; X64-NEXT: cmpb %dil, %bpl +; X64-NEXT: setl %dil +; X64-NEXT: setg %bpl +; X64-NEXT: subb %dil, %bpl +; X64-NEXT: movsbq %bpl, %r13 +; X64-NEXT: movq %r13, %rbp +; X64-NEXT: sarq $63, %rbp +; X64-NEXT: movq %rbp, %rdi +; X64-NEXT: shldq $62, %r13, %rdi +; X64-NEXT: movq %rdi, 88(%rax) +; X64-NEXT: shrq $2, %rbp +; X64-NEXT: movl %ebp, 96(%rax) +; X64-NEXT: movq %r10, %rdi +; X64-NEXT: shldq $20, %r9, %rdi +; X64-NEXT: movq %rdi, 64(%rax) +; X64-NEXT: movq %r11, %rdi +; X64-NEXT: shldq $31, %r8, %rdi +; X64-NEXT: movq %rdi, 48(%rax) +; X64-NEXT: movq %rcx, %rdi +; X64-NEXT: shldq $42, %rbx, %rdi +; X64-NEXT: movq %rdi, 32(%rax) +; X64-NEXT: movabsq $9007199254738944, %rdi # imm = 0x1FFFFFFFFFF800 +; X64-NEXT: andq %r12, %rdi +; X64-NEXT: shldq $53, %rdx, %r12 +; X64-NEXT: movq %r12, 16(%rax) +; X64-NEXT: movabsq $9007199254740991, %r12 # imm = 0x1FFFFFFFFFFFFF +; X64-NEXT: andq %r12, %r15 +; X64-NEXT: shldq $9, %r14, %r15 +; X64-NEXT: shlq $62, %r13 +; X64-NEXT: orq %r15, %r13 +; X64-NEXT: movq %r13, 80(%rax) +; X64-NEXT: movabsq $2251799813685247, %r15 # imm = 0x7FFFFFFFFFFFF +; X64-NEXT: andq %rbp, %r15 +; X64-NEXT: movq %r15, %r13 +; X64-NEXT: shrq $48, %r13 +; X64-NEXT: movb %r13b, 102(%rax) +; X64-NEXT: shrq $32, %r15 +; X64-NEXT: movw %r15w, 100(%rax) +; X64-NEXT: shlq $42, %rbx +; X64-NEXT: shrq $11, %rdi +; X64-NEXT: orq %rbx, %rdi +; X64-NEXT: movq %rdi, 24(%rax) +; X64-NEXT: shlq $9, %r14 +; X64-NEXT: shrq $44, %r10 +; X64-NEXT: andl $511, %r10d # imm = 0x1FF +; X64-NEXT: orq %r14, %r10 ; X64-NEXT: movq %r10, 72(%rax) -; X64-NEXT: movq %r9, (%rax) -; X64-NEXT: shlq $62, %r14 -; X64-NEXT: shrq $55, %rsi -; X64-NEXT: orq %r14, %rsi -; X64-NEXT: movq %rsi, 80(%rax) -; X64-NEXT: movq %r8, %rsi -; X64-NEXT: shrq $44, %rsi -; X64-NEXT: movq %rsi, 64(%rax) -; X64-NEXT: shlq $20, %r8 -; X64-NEXT: movq %r8, 56(%rax) -; X64-NEXT: movq %rcx, %rsi -; X64-NEXT: shrq $33, %rsi -; X64-NEXT: movq %rsi, 48(%rax) -; X64-NEXT: shlq $31, %rcx -; X64-NEXT: movq %rcx, 40(%rax) -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: shlq $20, %r9 +; X64-NEXT: shrq $33, %r11 +; X64-NEXT: andl $1048575, %r11d # imm = 0xFFFFF +; X64-NEXT: orq %r9, %r11 +; X64-NEXT: movq %r11, 56(%rax) +; X64-NEXT: shlq $31, %r8 ; X64-NEXT: shrq $22, %rcx -; X64-NEXT: movq %rcx, 32(%rax) -; X64-NEXT: shlq $42, %rdx -; X64-NEXT: movq %rdx, 24(%rax) -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: shrq $11, %rcx -; X64-NEXT: movq %rcx, 16(%rax) -; X64-NEXT: shlq $53, %rdi -; X64-NEXT: movq %rdi, 8(%rax) -; X64-NEXT: movb $0, 102(%rax) -; X64-NEXT: movw $0, 100(%rax) -; X64-NEXT: movl $0, 96(%rax) +; X64-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF +; X64-NEXT: orq %r8, %rcx +; X64-NEXT: movq %rcx, 40(%rax) +; X64-NEXT: movq %rsi, %xmm1 +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-NEXT: movq %xmm0, %rcx +; X64-NEXT: andq %r12, %rcx +; X64-NEXT: shlq $53, %rdx +; X64-NEXT: orq %rcx, %rdx +; X64-NEXT: movq %rdx, 8(%rax) ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 +; X64-NEXT: popq %r13 ; X64-NEXT: popq %r14 ; X64-NEXT: popq %r15 ; X64-NEXT: popq %rbp @@ -1723,203 +1593,200 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $44, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addb %al, %al +; X86-NEXT: sarb %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movb %al, (%esp) # 1-byte Spill -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addb %cl, %cl -; X86-NEXT: movb {{[0-9]+}}(%esp), %dh -; X86-NEXT: addb %dh, %dh -; X86-NEXT: movb {{[0-9]+}}(%esp), %dl -; X86-NEXT: addb %dl, %dl -; X86-NEXT: sarb %dl -; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: addb %ch, %ch -; X86-NEXT: sarb %ch +; X86-NEXT: addb %al, %al +; X86-NEXT: sarb %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: addb %al, %al ; X86-NEXT: sarb %al -; X86-NEXT: movb {{[0-9]+}}(%esp), %ah -; X86-NEXT: addb %ah, %ah -; X86-NEXT: sarb %ah -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpb %al, %ah -; X86-NEXT: setg %al -; X86-NEXT: movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl $-1, %esi -; X86-NEXT: movl $-1, %edi -; X86-NEXT: jl .LBB18_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movb %al, %bl -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: .LBB18_2: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movb {{[0-9]+}}(%esp), %bh -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addb %al, %al +; X86-NEXT: sarb %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: addb %al, %al +; X86-NEXT: sarb %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: addb %al, %al -; X86-NEXT: movb %al, (%esp) # 1-byte Spill +; X86-NEXT: sarb %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: addb %dl, %dl +; X86-NEXT: sarb %dl +; X86-NEXT: movb {{[0-9]+}}(%esp), %ah +; X86-NEXT: addb %ah, %ah +; X86-NEXT: sarb %ah +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addb %cl, %cl ; X86-NEXT: sarb %cl -; X86-NEXT: sarb %dh -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb %dl, %ch -; X86-NEXT: setg %dl -; X86-NEXT: movl $-1, %ebp -; X86-NEXT: movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: jl .LBB18_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movb %dl, %al -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: .LBB18_4: ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: movb {{[0-9]+}}(%esp), %dl -; X86-NEXT: addb %bl, %bl -; X86-NEXT: addb %bh, %bh -; X86-NEXT: sarb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: sarb (%esp) # 1-byte Folded Spill -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb %cl, %dh -; X86-NEXT: setg %cl -; X86-NEXT: movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: jl .LBB18_6 -; X86-NEXT: # %bb.5: -; X86-NEXT: movb %cl, %al -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: .LBB18_6: -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-NEXT: movb {{[0-9]+}}(%esp), %dh ; X86-NEXT: addb %ch, %ch -; X86-NEXT: addb %dl, %dl +; X86-NEXT: sarb %ch +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: addb %bl, %bl ; X86-NEXT: sarb %bl -; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movb {{[0-9]+}}(%esp), %bh +; X86-NEXT: addb %bh, %bh ; X86-NEXT: sarb %bh -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: movb (%esp), %bl # 1-byte Reload -; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload -; X86-NEXT: setg %bl -; X86-NEXT: movl $-1, %esi -; X86-NEXT: movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: jl .LBB18_8 -; X86-NEXT: # %bb.7: -; X86-NEXT: movb %bl, %al -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: .LBB18_8: -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addb %cl, %cl +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: addb %al, %al +; X86-NEXT: sarb %al +; X86-NEXT: movb {{[0-9]+}}(%esp), %dh ; X86-NEXT: addb %dh, %dh -; X86-NEXT: sarb %ch -; X86-NEXT: sarb %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload -; X86-NEXT: setg %bl -; X86-NEXT: movl $-1, %edi -; X86-NEXT: movl $-1, %ebp -; X86-NEXT: jl .LBB18_10 -; X86-NEXT: # %bb.9: -; X86-NEXT: movb %bl, %al -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl %ebp, %ebp -; X86-NEXT: .LBB18_10: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sarb %cl ; X86-NEXT: sarb %dh -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpb %ch, %dl +; X86-NEXT: cmpb %al, %dh +; X86-NEXT: setl %al +; X86-NEXT: setg %dh +; X86-NEXT: subb %al, %dh +; X86-NEXT: movsbl %dh, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $2097151, %esi # imm = 0x1FFFFF +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpb %bl, %bh +; X86-NEXT: setl %al +; X86-NEXT: setg %dh +; X86-NEXT: subb %al, %dh +; X86-NEXT: movsbl %dh, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $2097151, %esi # imm = 0x1FFFFF +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpb %cl, %ch +; X86-NEXT: setl %al +; X86-NEXT: setg %cl +; X86-NEXT: subb %al, %cl +; X86-NEXT: movsbl %cl, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ecx, (%ebp) +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl $2097151, %esi # imm = 0x1FFFFF +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpb %dl, %ah +; X86-NEXT: setl %al ; X86-NEXT: setg %dl -; X86-NEXT: movl $-1, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl $-1, %esi -; X86-NEXT: jl .LBB18_12 -; X86-NEXT: # %bb.11: -; X86-NEXT: movb %dl, %bl -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: .LBB18_12: +; X86-NEXT: subb %al, %dl +; X86-NEXT: movsbl %dl, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpb %cl, %dh -; X86-NEXT: setg %cl -; X86-NEXT: movl $-1, %edx -; X86-NEXT: jl .LBB18_14 -; X86-NEXT: # %bb.13: -; X86-NEXT: movb %cl, %bl -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: .LBB18_14: -; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: sarl $31, %edi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload +; X86-NEXT: setl %al +; X86-NEXT: setg %dl +; X86-NEXT: subb %al, %dl +; X86-NEXT: movsbl %dl, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload +; X86-NEXT: setl %al +; X86-NEXT: setg %dl +; X86-NEXT: subb %al, %dl +; X86-NEXT: movsbl %dl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload +; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload +; X86-NEXT: setl %dl +; X86-NEXT: setg %dh +; X86-NEXT: subb %dl, %dh +; X86-NEXT: movsbl %dh, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edx, 96(%ebp) +; X86-NEXT: movl %edx, 92(%ebp) +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, 80(%ebp) +; X86-NEXT: movl %eax, 68(%ebp) +; X86-NEXT: movl %eax, 64(%ebp) +; X86-NEXT: movl %esi, 52(%ebp) +; X86-NEXT: movl %esi, 48(%ebp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, 36(%ebp) +; X86-NEXT: movl %edi, 24(%ebp) +; X86-NEXT: movl %edi, 20(%ebp) +; X86-NEXT: movl %ecx, 8(%ebp) +; X86-NEXT: movl %ecx, 4(%ebp) +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $30, %edx, %ecx +; X86-NEXT: movl %ecx, 88(%ebp) +; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl $9, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: shldl $9, %ebp, %ecx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: movl %ecx, 76(%ebx) +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $20, %ebx, %ecx +; X86-NEXT: movl %ecx, 60(%ebp) ; X86-NEXT: movl %esi, %ecx -; X86-NEXT: shrl $2, %ecx -; X86-NEXT: movl %ecx, 92(%eax) -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: shrl $23, %ecx -; X86-NEXT: movl %ecx, 80(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: shrl $12, %ecx -; X86-NEXT: movl %ecx, 64(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: shrl %ecx -; X86-NEXT: movl %ecx, 48(%eax) +; X86-NEXT: shldl $31, %ebx, %ecx +; X86-NEXT: movl %ecx, 44(%ebp) +; X86-NEXT: movl %ebp, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shrl $22, %ecx -; X86-NEXT: movl %ecx, 36(%eax) +; X86-NEXT: shldl $10, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: shldl $10, %ebp, %ecx +; X86-NEXT: movl %ecx, 32(%ebx) +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: shldl $21, %ebp, %ecx +; X86-NEXT: movl %ecx, 16(%ebx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: shrl $11, %ecx -; X86-NEXT: movl %ecx, 20(%eax) +; X86-NEXT: shrl $2, %ecx +; X86-NEXT: movw %cx, 100(%ebx) +; X86-NEXT: shll $21, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl %ebp, 12(%ebx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: shldl $30, %ecx, %esi -; X86-NEXT: movl %esi, 88(%eax) ; X86-NEXT: shll $30, %ecx -; X86-NEXT: movl %ecx, 84(%eax) +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, 84(%ebx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $9, %ecx, %ebp -; X86-NEXT: movl %ebp, 76(%eax) ; X86-NEXT: shll $9, %ecx -; X86-NEXT: movl %ecx, 72(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $20, %ecx, %edi -; X86-NEXT: movl %edi, 60(%eax) -; X86-NEXT: shll $20, %ecx -; X86-NEXT: movl %ecx, 56(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $31, %ecx, %ebx -; X86-NEXT: movl %ebx, 44(%eax) -; X86-NEXT: shll $31, %ecx -; X86-NEXT: movl %ecx, 40(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl $10, %ecx, %esi -; X86-NEXT: movl %esi, 32(%eax) -; X86-NEXT: shll $10, %ecx -; X86-NEXT: movl %ecx, 28(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $21, %ecx, %edx -; X86-NEXT: movl %edx, 16(%eax) -; X86-NEXT: shll $21, %ecx -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movb $0, 102(%eax) -; X86-NEXT: movw $0, 100(%eax) -; X86-NEXT: movl $0, 96(%eax) -; X86-NEXT: movl $0, 68(%eax) -; X86-NEXT: movl $0, 52(%eax) -; X86-NEXT: movl $0, 24(%eax) -; X86-NEXT: movl $0, 8(%eax) -; X86-NEXT: addl $44, %esp +; X86-NEXT: shrl $12, %eax +; X86-NEXT: andl $511, %eax # imm = 0x1FF +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, 72(%ebx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shll $20, %eax +; X86-NEXT: shrl %esi +; X86-NEXT: andl $1048575, %esi # imm = 0xFFFFF +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl %esi, 56(%ebx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shll $31, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, 40(%ebx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shll $10, %eax +; X86-NEXT: shrl $11, %edi +; X86-NEXT: andl $1023, %edi # imm = 0x3FF +; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl %edi, 28(%ebx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $18, %eax +; X86-NEXT: andl $7, %eax +; X86-NEXT: movb %al, 102(%ebx) +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1936,39 +1803,36 @@ define <1 x i3> @scmp_scalarize(<1 x i33> %x, <1 x i33> %y) nounwind { ; X64-NEXT: sarq $31, %rsi ; X64-NEXT: shlq $31, %rdi ; X64-NEXT: sarq $31, %rdi -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpq %rsi, %rdi -; X64-NEXT: setg %cl -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovgel %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: setl %cl +; X64-NEXT: setg %al +; X64-NEXT: subb %cl, %al ; X64-NEXT: retq ; ; X86-LABEL: scmp_scalarize: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: andl $1, %edi -; X86-NEXT: negl %edi -; X86-NEXT: cmpl %ecx, %esi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: sbbl %eax, %edx -; X86-NEXT: setl %dl -; X86-NEXT: cmpl %esi, %ecx -; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: movb $-1, %al -; X86-NEXT: jl .LBB19_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %eax -; X86-NEXT: .LBB19_2: +; X86-NEXT: andl $1, %esi +; X86-NEXT: negl %esi +; X86-NEXT: cmpl %ecx, %edx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: setl %bl +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: setl %al +; X86-NEXT: subb %bl, %al ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %1 = call <1 x i3> @llvm.scmp(<1 x i33> %x, <1 x i33> %y) ret <1 x i3> %1 @@ -1981,29 +1845,29 @@ define <2 x i8> @scmp_bool_operands(<2 x i1> %x, <2 x i1> %y) nounwind { ; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: andb $1, %cl -; X64-NEXT: negb %cl -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; X64-NEXT: andb $1, %sil -; X64-NEXT: negb %sil -; X64-NEXT: xorl %edi, %edi -; X64-NEXT: cmpb %cl, %sil -; X64-NEXT: setg %dil -; X64-NEXT: movl $255, %ecx -; X64-NEXT: cmovll %ecx, %edi -; X64-NEXT: shll $8, %edi ; X64-NEXT: andb $1, %al ; X64-NEXT: negb %al +; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi ; X64-NEXT: andb $1, %dl ; X64-NEXT: negb %dl -; X64-NEXT: xorl %esi, %esi ; X64-NEXT: cmpb %al, %dl -; X64-NEXT: setg %sil -; X64-NEXT: cmovll %ecx, %esi -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: orl %edi, %eax -; X64-NEXT: movd %eax, %xmm0 +; X64-NEXT: setl %al +; X64-NEXT: setg %dl +; X64-NEXT: subb %al, %dl +; X64-NEXT: movzbl %dl, %eax +; X64-NEXT: andb $1, %cl +; X64-NEXT: negb %cl +; X64-NEXT: andb $1, %sil +; X64-NEXT: negb %sil +; X64-NEXT: cmpb %cl, %sil +; X64-NEXT: setl %cl +; X64-NEXT: setg %dl +; X64-NEXT: subb %cl, %dl +; X64-NEXT: movzbl %dl, %ecx +; X64-NEXT: shll $8, %ecx +; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movd %ecx, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: scmp_bool_operands: @@ -2011,29 +1875,23 @@ define <2 x i8> @scmp_bool_operands(<2 x i1> %x, <2 x i1> %y) nounwind { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: andb $1, %cl ; X86-NEXT: negb %cl -; X86-NEXT: movb {{[0-9]+}}(%esp), %ah -; X86-NEXT: andb $1, %ah -; X86-NEXT: negb %ah -; X86-NEXT: movb {{[0-9]+}}(%esp), %al -; X86-NEXT: andb $1, %al -; X86-NEXT: negb %al ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: andb $1, %dl ; X86-NEXT: negb %dl -; X86-NEXT: cmpb %al, %dl -; X86-NEXT: setg %ch -; X86-NEXT: movb $-1, %dl -; X86-NEXT: movb $-1, %al -; X86-NEXT: jl .LBB20_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movb %ch, %al -; X86-NEXT: .LBB20_2: -; X86-NEXT: cmpb %cl, %ah -; X86-NEXT: setg %cl -; X86-NEXT: jl .LBB20_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: .LBB20_4: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andb $1, %al +; X86-NEXT: negb %al +; X86-NEXT: movb {{[0-9]+}}(%esp), %ah +; X86-NEXT: andb $1, %ah +; X86-NEXT: negb %ah +; X86-NEXT: cmpb %al, %ah +; X86-NEXT: setl %ah +; X86-NEXT: setg %al +; X86-NEXT: subb %ah, %al +; X86-NEXT: cmpb %cl, %dl +; X86-NEXT: setl %cl +; X86-NEXT: setg %dl +; X86-NEXT: subb %cl, %dl ; X86-NEXT: retl %1 = call <2 x i8> @llvm.scmp(<2 x i1> %x, <2 x i1> %y) ret <2 x i8> %1 @@ -2048,45 +1906,36 @@ define <2 x i16> @scmp_ret_wider_than_operands(<2 x i8> %x, <2 x i8> %y) nounwin ; X64-NEXT: movd %xmm0, %edx ; X64-NEXT: movl %edx, %esi ; X64-NEXT: shrl $8, %esi -; X64-NEXT: xorl %edi, %edi ; X64-NEXT: cmpb %cl, %sil -; X64-NEXT: setg %dil -; X64-NEXT: movl $65535, %ecx # imm = 0xFFFF -; X64-NEXT: cmovll %ecx, %edi -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpb %al, %dl +; X64-NEXT: setl %cl ; X64-NEXT: setg %sil -; X64-NEXT: cmovll %ecx, %esi -; X64-NEXT: movd %esi, %xmm0 -; X64-NEXT: pinsrw $1, %edi, %xmm0 +; X64-NEXT: subb %cl, %sil +; X64-NEXT: movsbl %sil, %ecx +; X64-NEXT: cmpb %al, %dl +; X64-NEXT: setl %al +; X64-NEXT: setg %dl +; X64-NEXT: subb %al, %dl +; X64-NEXT: movsbl %dl, %eax +; X64-NEXT: movd %eax, %xmm0 +; X64-NEXT: pinsrw $1, %ecx, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: scmp_ret_wider_than_operands: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al -; X86-NEXT: setg %ch -; X86-NEXT: movl $65535, %edx # imm = 0xFFFF -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: jl .LBB21_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movb %ch, %bl -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: .LBB21_2: -; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: setl %al +; X86-NEXT: setg %dl +; X86-NEXT: subb %al, %dl +; X86-NEXT: movsbl %dl, %eax ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl -; X86-NEXT: setg %cl -; X86-NEXT: jl .LBB21_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movb %cl, %bl -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: .LBB21_4: +; X86-NEXT: setl %cl +; X86-NEXT: setg %dl +; X86-NEXT: subb %cl, %dl +; X86-NEXT: movsbl %dl, %edx ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: # kill: def $dx killed $dx killed $edx -; X86-NEXT: popl %ebx ; X86-NEXT: retl %1 = call <2 x i16> @llvm.scmp(<2 x i8> %x, <2 x i8> %y) ret <2 x i16> %1 diff --git a/llvm/test/CodeGen/X86/stack-folding-3dnow.ll b/llvm/test/CodeGen/X86/stack-folding-3dnow.ll deleted file mode 100644 index 1cbd61567f3270..00000000000000 --- a/llvm/test/CodeGen/X86/stack-folding-3dnow.ll +++ /dev/null @@ -1,387 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+3dnow | FileCheck %s - -define x86_mmx @stack_fold_pavgusb(x86_mmx %a, x86_mmx %b) { -; CHECK-LABEL: stack_fold_pavgusb: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pavgusb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx, x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pf2id(x86_mmx %a) { -; CHECK-LABEL: stack_fold_pf2id: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pf2id {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx %a) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pf2iw(x86_mmx %a) { -; CHECK-LABEL: stack_fold_pf2iw: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pf2iw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx %a) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pfacc(x86_mmx %a, x86_mmx %b) { -; CHECK-LABEL: stack_fold_pfacc: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfacc {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx, x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pfadd(x86_mmx %a, x86_mmx %b) { -; CHECK-LABEL: stack_fold_pfadd: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfadd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx, x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pfcmpeq(x86_mmx %a, x86_mmx %b) { -; CHECK-LABEL: stack_fold_pfcmpeq: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfcmpeq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx, x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pfcmpge(x86_mmx %a, x86_mmx %b) { -; CHECK-LABEL: stack_fold_pfcmpge: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfcmpge {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx, x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pfcmpgt(x86_mmx %a, x86_mmx %b) { -; CHECK-LABEL: stack_fold_pfcmpgt: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfcmpgt {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx, x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pfmax(x86_mmx %a, x86_mmx %b) { -; CHECK-LABEL: stack_fold_pfmax: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfmax {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx, x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pfmin(x86_mmx %a, x86_mmx %b) { -; CHECK-LABEL: stack_fold_pfmin: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfmin {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx, x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pfmul(x86_mmx %a, x86_mmx %b) { -; CHECK-LABEL: stack_fold_pfmul: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfmul {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx, x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pfnacc(x86_mmx %a, x86_mmx %b) { -; CHECK-LABEL: stack_fold_pfnacc: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfnacc {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx, x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pfpnacc(x86_mmx %a, x86_mmx %b) { -; CHECK-LABEL: stack_fold_pfpnacc: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfpnacc {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx, x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pfrcp(x86_mmx %a) { -; CHECK-LABEL: stack_fold_pfrcp: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfrcp {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx %a) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pfrcpit1(x86_mmx %a, x86_mmx %b) { -; CHECK-LABEL: stack_fold_pfrcpit1: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfrcpit1 {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx, x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pfrcpit2(x86_mmx %a, x86_mmx %b) { -; CHECK-LABEL: stack_fold_pfrcpit2: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfrcpit2 {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx, x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pfrsqit1(x86_mmx %a, x86_mmx %b) { -; CHECK-LABEL: stack_fold_pfrsqit1: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfrsqit1 {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx, x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pfrsqrt(x86_mmx %a) { -; CHECK-LABEL: stack_fold_pfrsqrt: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfrsqrt {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx %a) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pfsub(x86_mmx %a, x86_mmx %b) { -; CHECK-LABEL: stack_fold_pfsub: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfsub {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx, x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pfsubr(x86_mmx %a, x86_mmx %b) { -; CHECK-LABEL: stack_fold_pfsubr: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfsubr {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx, x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pi2fd(x86_mmx %a) { -; CHECK-LABEL: stack_fold_pi2fd: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pi2fd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx %a) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pi2fw(x86_mmx %a) { -; CHECK-LABEL: stack_fold_pi2fw: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pi2fw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx %a) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pmulhrw(x86_mmx %a, x86_mmx %b) { -; CHECK-LABEL: stack_fold_pmulhrw: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmulhrw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx, x86_mmx) nounwind readnone - -define x86_mmx @stack_fold_pswapd(x86_mmx %a) { -; CHECK-LABEL: stack_fold_pswapd: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: #APP -; CHECK-NEXT: nop -; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pswapd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: # mm0 = mem[1,0] -; CHECK-NEXT: movq2dq %mm0, %xmm0 -; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx %a) nounwind readnone - ret x86_mmx %2 -} -declare x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx) nounwind readnone diff --git a/llvm/test/CodeGen/X86/statepoint-vreg-twoaddr.mir b/llvm/test/CodeGen/X86/statepoint-vreg-twoaddr.mir index 9e29739812d32b..665ca956bc32eb 100644 --- a/llvm/test/CodeGen/X86/statepoint-vreg-twoaddr.mir +++ b/llvm/test/CodeGen/X86/statepoint-vreg-twoaddr.mir @@ -1,4 +1,5 @@ # RUN: llc -x mir -run-pass=twoaddressinstruction < %s | FileCheck %s +# RUN: llc -x mir --passes=two-address-instruction < %s | FileCheck %s # This test checks that TwoAddressInstruction pass does not create redundate COPY # instruction for STATEPOINT tied operands. diff --git a/llvm/test/CodeGen/X86/twoaddr-mul2.mir b/llvm/test/CodeGen/X86/twoaddr-mul2.mir index 5aa9613e162eba..e21005fa92397d 100644 --- a/llvm/test/CodeGen/X86/twoaddr-mul2.mir +++ b/llvm/test/CodeGen/X86/twoaddr-mul2.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=x86_64-unknown -mcpu=haswell -run-pass=twoaddressinstruction -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=x86_64-unknown -mcpu=haswell --passes=two-address-instruction -verify-machineinstrs %s -o - | FileCheck %s # Check that we don't have any uses of [[COPY]] after it is killed. --- diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll index 344404749d7ef3..ac35605be4d52e 100644 --- a/llvm/test/CodeGen/X86/ucmp.ll +++ b/llvm/test/CodeGen/X86/ucmp.ll @@ -5,24 +5,17 @@ define i8 @ucmp.8.8(i8 %x, i8 %y) nounwind { ; X64-LABEL: ucmp.8.8: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpb %sil, %dil -; X64-NEXT: seta %cl -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovael %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al ; X64-NEXT: retq ; ; X86-LABEL: ucmp.8.8: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al -; X86-NEXT: seta %cl -; X86-NEXT: movb $-1, %al -; X86-NEXT: jb .LBB0_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB0_2: +; X86-NEXT: seta %al +; X86-NEXT: sbbb $0, %al ; X86-NEXT: retl %1 = call i8 @llvm.ucmp(i8 %x, i8 %y) ret i8 %1 @@ -31,24 +24,17 @@ define i8 @ucmp.8.8(i8 %x, i8 %y) nounwind { define i8 @ucmp.8.16(i16 %x, i16 %y) nounwind { ; X64-LABEL: ucmp.8.16: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpw %si, %di -; X64-NEXT: seta %cl -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovael %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al ; X64-NEXT: retq ; ; X86-LABEL: ucmp.8.16: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %ax -; X86-NEXT: seta %cl -; X86-NEXT: movb $-1, %al -; X86-NEXT: jb .LBB1_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB1_2: +; X86-NEXT: seta %al +; X86-NEXT: sbbb $0, %al ; X86-NEXT: retl %1 = call i8 @llvm.ucmp(i16 %x, i16 %y) ret i8 %1 @@ -57,24 +43,17 @@ define i8 @ucmp.8.16(i16 %x, i16 %y) nounwind { define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind { ; X64-LABEL: ucmp.8.32: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpl %esi, %edi -; X64-NEXT: seta %cl -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovael %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al ; X64-NEXT: retq ; ; X86-LABEL: ucmp.8.32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; X86-NEXT: seta %cl -; X86-NEXT: movb $-1, %al -; X86-NEXT: jb .LBB2_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB2_2: +; X86-NEXT: seta %al +; X86-NEXT: sbbb $0, %al ; X86-NEXT: retl %1 = call i8 @llvm.ucmp(i32 %x, i32 %y) ret i8 %1 @@ -83,33 +62,26 @@ define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind { define i8 @ucmp.8.64(i64 %x, i64 %y) nounwind { ; X64-LABEL: ucmp.8.64: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpq %rsi, %rdi -; X64-NEXT: seta %cl -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovael %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al ; X64-NEXT: retq ; ; X86-LABEL: ucmp.8.64: ; X86: # %bb.0: ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %eax, %esi -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: setb %cl -; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmpl %ecx, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: setb %al +; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: sbbl %edi, %edx -; X86-NEXT: movb $-1, %al -; X86-NEXT: jb .LBB3_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB3_2: +; X86-NEXT: sbbb $0, %al ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl @@ -124,12 +96,9 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind { ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: sbbq %rsi, %rax ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %r8d ; X64-NEXT: cmpq %rdx, %rdi ; X64-NEXT: sbbq %rcx, %rsi -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovael %r8d, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: sbbb $0, %al ; X64-NEXT: retq ; ; X86-LABEL: ucmp.8.128: @@ -138,30 +107,26 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %ebp, %ebx -; X86-NEXT: sbbl %edx, %ebx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: setb %cl -; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: sbbl %ebp, %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movb $-1, %al -; X86-NEXT: jb .LBB4_2 -; X86-NEXT: # %bb.1: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: sbbl %esi, %eax ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB4_2: +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: sbbl %ecx, %eax +; X86-NEXT: setb %al +; X86-NEXT: cmpl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: sbbl %ebp, %esi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: sbbb $0, %al ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -174,25 +139,19 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind { define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind { ; X64-LABEL: ucmp.32.32: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpl %esi, %edi -; X64-NEXT: seta %cl -; X64-NEXT: movl $-1, %eax -; X64-NEXT: cmovael %ecx, %eax +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq ; ; X86-LABEL: ucmp.32.32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; X86-NEXT: seta %dl -; X86-NEXT: movl $-1, %eax -; X86-NEXT: jb .LBB5_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movb %dl, %cl -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB5_2: +; X86-NEXT: seta %al +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %eax ; X86-NEXT: retl %1 = call i32 @llvm.ucmp(i32 %x, i32 %y) ret i32 %1 @@ -201,34 +160,32 @@ define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind { define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind { ; X64-LABEL: ucmp.32.64: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpq %rsi, %rdi -; X64-NEXT: seta %cl -; X64-NEXT: movl $-1, %eax -; X64-NEXT: cmovael %ecx, %eax +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax ; X64-NEXT: retq ; ; X86-LABEL: ucmp.32.64: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %eax, %esi -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: setb %cl -; X86-NEXT: cmpl %esi, %eax -; X86-NEXT: sbbl %edi, %edx -; X86-NEXT: movl $-1, %eax -; X86-NEXT: jb .LBB6_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: .LBB6_2: +; X86-NEXT: cmpl %eax, %edx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %ecx, %edi +; X86-NEXT: setb %bl +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: sbbb $0, %bl +; X86-NEXT: movsbl %bl, %eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %1 = call i32 @llvm.ucmp(i64 %x, i64 %y) ret i32 %1 @@ -237,36 +194,34 @@ define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind { define i64 @ucmp.64.64(i64 %x, i64 %y) nounwind { ; X64-LABEL: ucmp.64.64: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpq %rsi, %rdi -; X64-NEXT: seta %cl -; X64-NEXT: movq $-1, %rax -; X64-NEXT: cmovaeq %rcx, %rax +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbq %al, %rax ; X64-NEXT: retq ; ; X86-LABEL: ucmp.64.64: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %eax, %esi -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: setb %cl -; X86-NEXT: cmpl %esi, %eax -; X86-NEXT: sbbl %edi, %edx -; X86-NEXT: movl $-1, %eax -; X86-NEXT: movl $-1, %edx -; X86-NEXT: jb .LBB7_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: .LBB7_2: +; X86-NEXT: cmpl %eax, %edx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %ecx, %edi +; X86-NEXT: setb %bl +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: sbbb $0, %bl +; X86-NEXT: movsbl %bl, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: sarl $31, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %1 = call i64 @llvm.ucmp(i64 %x, i64 %y) ret i64 %1 @@ -275,24 +230,17 @@ define i64 @ucmp.64.64(i64 %x, i64 %y) nounwind { define i4 @ucmp_narrow_result(i32 %x, i32 %y) nounwind { ; X64-LABEL: ucmp_narrow_result: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpl %esi, %edi -; X64-NEXT: seta %cl -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovael %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al ; X64-NEXT: retq ; ; X86-LABEL: ucmp_narrow_result: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; X86-NEXT: seta %cl -; X86-NEXT: movb $-1, %al -; X86-NEXT: jb .LBB8_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB8_2: +; X86-NEXT: seta %al +; X86-NEXT: sbbb $0, %al ; X86-NEXT: retl %1 = call i4 @llvm.ucmp(i32 %x, i32 %y) ret i4 %1 @@ -304,35 +252,28 @@ define i8 @ucmp_narrow_op(i62 %x, i62 %y) nounwind { ; X64-NEXT: movabsq $4611686018427387903, %rax # imm = 0x3FFFFFFFFFFFFFFF ; X64-NEXT: andq %rax, %rsi ; X64-NEXT: andq %rax, %rdi -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpq %rsi, %rdi -; X64-NEXT: seta %cl -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovael %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al ; X64-NEXT: retq ; ; X86-LABEL: ucmp_narrow_op: ; X86: # %bb.0: ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl $1073741823, %eax # imm = 0x3FFFFFFF +; X86-NEXT: movl $1073741823, %ecx # imm = 0x3FFFFFFF ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: andl %eax, %edx -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl %ecx, %edx +; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: cmpl %esi, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: setb %cl -; X86-NEXT: cmpl %edi, %esi -; X86-NEXT: sbbl %eax, %edx -; X86-NEXT: movb $-1, %al -; X86-NEXT: jb .LBB9_2 -; X86-NEXT: # %bb.1: ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB9_2: +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: setb %al +; X86-NEXT: cmpl %edi, %esi +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: sbbb $0, %al ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl @@ -343,39 +284,31 @@ define i8 @ucmp_narrow_op(i62 %x, i62 %y) nounwind { define i141 @ucmp_wide_result(i32 %x, i32 %y) nounwind { ; X64-LABEL: ucmp_wide_result: ; X64: # %bb.0: -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpl %esi, %edi -; X64-NEXT: seta %cl -; X64-NEXT: movq $-1, %rax -; X64-NEXT: cmovaeq %rcx, %rax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbq %al, %rax +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: sarq $63, %rdx +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: andl $8191, %ecx # imm = 0x1FFF ; X64-NEXT: retq ; ; X86-LABEL: ucmp_wide_result: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi -; X86-NEXT: seta %bl -; X86-NEXT: movl $-1, %esi -; X86-NEXT: jb .LBB10_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movb %bl, %dl -; X86-NEXT: movl %edx, %esi -; X86-NEXT: .LBB10_2: -; X86-NEXT: sbbl %ecx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: seta %cl +; X86-NEXT: sbbb $0, %cl +; X86-NEXT: movsbl %cl, %ecx +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl %esi, (%eax) -; X86-NEXT: movl $0, 12(%eax) -; X86-NEXT: movl $0, 8(%eax) -; X86-NEXT: movw $0, 16(%eax) -; X86-NEXT: popl %esi -; X86-NEXT: popl %ebx +; X86-NEXT: andl $8191, %ecx # imm = 0x1FFF +; X86-NEXT: movw %cx, 16(%eax) ; X86-NEXT: retl $4 %1 = call i141 @llvm.ucmp(i32 %x, i32 %y) ret i141 %1 @@ -391,12 +324,9 @@ define i8 @ucmp_wide_op(i109 %x, i109 %y) nounwind { ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: sbbq %rsi, %rax ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %r8d ; X64-NEXT: cmpq %rdx, %rdi ; X64-NEXT: sbbq %rcx, %rsi -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovael %r8d, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: sbbb $0, %al ; X64-NEXT: retq ; ; X86-LABEL: ucmp_wide_op: @@ -405,33 +335,27 @@ define i8 @ucmp_wide_op(i109 %x, i109 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl $8191, %eax # imm = 0x1FFF -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: andl %eax, %ecx -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $8191, %ecx # imm = 0x1FFF ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl %ecx, %edx +; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %esi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: sbbl %ecx, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: setb %al ; X86-NEXT: cmpl %ebp, {{[0-9]+}}(%esp) -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl %edi, %ebx -; X86-NEXT: sbbl %eax, %ecx -; X86-NEXT: movb $-1, %al -; X86-NEXT: jb .LBB11_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: .LBB11_2: -; X86-NEXT: addl $4, %esp +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: sbbb $0, %al ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -446,32 +370,24 @@ define i41 @ucmp_uncommon_types(i7 %x, i7 %y) nounwind { ; X64: # %bb.0: ; X64-NEXT: andb $127, %sil ; X64-NEXT: andb $127, %dil -; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpb %sil, %dil -; X64-NEXT: seta %cl -; X64-NEXT: movq $-1, %rax -; X64-NEXT: cmovaeq %rcx, %rax +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbq %al, %rax ; X64-NEXT: retq ; ; X86-LABEL: ucmp_uncommon_types: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andb $127, %al -; X86-NEXT: movb {{[0-9]+}}(%esp), %ah -; X86-NEXT: andb $127, %ah -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpb %al, %ah -; X86-NEXT: seta %bl -; X86-NEXT: movl $-1, %eax -; X86-NEXT: jb .LBB12_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movb %bl, %cl -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: .LBB12_2: -; X86-NEXT: sbbl %edx, %edx -; X86-NEXT: popl %ebx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andb $127, %cl +; X86-NEXT: cmpb %al, %cl +; X86-NEXT: seta %al +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: sarl $31, %edx ; X86-NEXT: retl %1 = call i41 @llvm.ucmp(i7 %x, i7 %y) ret i41 %1 @@ -484,38 +400,37 @@ define <4 x i32> @ucmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: movd %xmm2, %eax ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X64-NEXT: movd %xmm2, %ecx -; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpl %eax, %ecx -; X64-NEXT: seta %dl -; X64-NEXT: movl $-1, %eax -; X64-NEXT: cmovbl %eax, %edx -; X64-NEXT: movd %edx, %xmm2 +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax +; X64-NEXT: movd %eax, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: movd %xmm3, %ecx +; X64-NEXT: movd %xmm3, %eax ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-NEXT: movd %xmm3, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm3 +; X64-NEXT: movd %xmm3, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax +; X64-NEXT: movd %eax, %xmm3 ; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X64-NEXT: movd %xmm1, %ecx -; X64-NEXT: movd %xmm0, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm2 +; X64-NEXT: movd %xmm1, %eax +; X64-NEXT: movd %xmm0, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax +; X64-NEXT: movd %eax, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X64-NEXT: movd %xmm1, %ecx +; X64-NEXT: movd %xmm1, %eax ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X64-NEXT: movd %xmm0, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm0 +; X64-NEXT: movd %xmm0, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax +; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; X64-NEXT: movdqa %xmm2, %xmm0 @@ -523,59 +438,37 @@ define <4 x i32> @ucmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind { ; ; X86-LABEL: ucmp_normal_vectors: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; X86-NEXT: seta %al -; X86-NEXT: movl $-1, %edx -; X86-NEXT: movl $-1, %ebp -; X86-NEXT: jb .LBB13_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movb %al, %bl -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: .LBB13_2: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi -; X86-NEXT: seta %al -; X86-NEXT: movl $-1, %esi -; X86-NEXT: jb .LBB13_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movb %al, %bl -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: .LBB13_4: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: seta %dl +; X86-NEXT: sbbb $0, %dl +; X86-NEXT: movsbl %dl, %edx ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi -; X86-NEXT: seta %cl -; X86-NEXT: movl $-1, %edi -; X86-NEXT: jb .LBB13_6 -; X86-NEXT: # %bb.5: -; X86-NEXT: movb %cl, %bl -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: .LBB13_6: -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: seta %bl +; X86-NEXT: sbbb $0, %bl +; X86-NEXT: movsbl %bl, %edi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi +; X86-NEXT: seta %bl +; X86-NEXT: sbbb $0, %bl +; X86-NEXT: movsbl %bl, %esi ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: seta %cl -; X86-NEXT: jb .LBB13_8 -; X86-NEXT: # %bb.7: -; X86-NEXT: movb %cl, %bl -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: .LBB13_8: -; X86-NEXT: movl %edx, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %ebp, (%eax) +; X86-NEXT: sbbb $0, %cl +; X86-NEXT: movsbl %cl, %ecx +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %1 = call <4 x i32> @llvm.ucmp(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %1 @@ -586,45 +479,41 @@ define <4 x i8> @ucmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64: # %bb.0: ; X64-NEXT: movd %xmm1, %eax ; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpl %eax, %ecx -; X64-NEXT: seta %dl -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovbl %eax, %edx -; X64-NEXT: movzbl %dl, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movzbl %al, %eax ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; X64-NEXT: movd %xmm2, %edx +; X64-NEXT: movd %xmm2, %ecx ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X64-NEXT: movd %xmm2, %esi -; X64-NEXT: xorl %edi, %edi -; X64-NEXT: cmpl %edx, %esi -; X64-NEXT: seta %dil -; X64-NEXT: cmovbl %eax, %edi -; X64-NEXT: movzbl %dil, %edx -; X64-NEXT: shll $8, %edx -; X64-NEXT: orl %ecx, %edx +; X64-NEXT: movd %xmm2, %edx +; X64-NEXT: cmpl %ecx, %edx +; X64-NEXT: seta %cl +; X64-NEXT: sbbb $0, %cl +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: shll $8, %ecx +; X64-NEXT: orl %eax, %ecx ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-NEXT: movd %xmm2, %ecx +; X64-NEXT: movd %xmm2, %eax ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-NEXT: movd %xmm2, %esi -; X64-NEXT: xorl %edi, %edi -; X64-NEXT: cmpl %ecx, %esi -; X64-NEXT: seta %dil -; X64-NEXT: cmovbl %eax, %edi -; X64-NEXT: movzbl %dil, %ecx -; X64-NEXT: shll $16, %ecx -; X64-NEXT: orl %edx, %ecx +; X64-NEXT: movd %xmm2, %edx +; X64-NEXT: cmpl %eax, %edx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: shll $16, %eax +; X64-NEXT: orl %ecx, %eax ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; X64-NEXT: movd %xmm1, %edx +; X64-NEXT: movd %xmm1, %ecx ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; X64-NEXT: movd %xmm0, %esi -; X64-NEXT: xorl %edi, %edi -; X64-NEXT: cmpl %edx, %esi -; X64-NEXT: seta %dil -; X64-NEXT: cmovbl %eax, %edi -; X64-NEXT: shll $24, %edi -; X64-NEXT: orl %ecx, %edi -; X64-NEXT: movd %edi, %xmm0 +; X64-NEXT: movd %xmm0, %edx +; X64-NEXT: cmpl %ecx, %edx +; X64-NEXT: seta %cl +; X64-NEXT: sbbb $0, %cl +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: shll $24, %ecx +; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movd %ecx, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: ucmp_narrow_vec_result: @@ -633,40 +522,24 @@ define <4 x i8> @ucmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: seta %cl +; X86-NEXT: sbbb $0, %cl +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi ; X86-NEXT: seta %ch -; X86-NEXT: movb $-1, %dl -; X86-NEXT: movb $-1, %cl -; X86-NEXT: jb .LBB14_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movb %ch, %cl -; X86-NEXT: .LBB14_2: -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; X86-NEXT: seta %al -; X86-NEXT: movb $-1, %ch -; X86-NEXT: jb .LBB14_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movb %al, %ch -; X86-NEXT: .LBB14_4: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbb $0, %ch ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi ; X86-NEXT: seta %bl -; X86-NEXT: movb $-1, %dh -; X86-NEXT: jb .LBB14_6 -; X86-NEXT: # %bb.5: -; X86-NEXT: movb %bl, %dh -; X86-NEXT: .LBB14_6: -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi -; X86-NEXT: seta %bl -; X86-NEXT: jb .LBB14_8 -; X86-NEXT: # %bb.7: -; X86-NEXT: movb %bl, %dl -; X86-NEXT: .LBB14_8: +; X86-NEXT: sbbb $0, %bl +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: seta %dl +; X86-NEXT: sbbb $0, %dl ; X86-NEXT: movb %dl, 3(%eax) -; X86-NEXT: movb %dh, 2(%eax) +; X86-NEXT: movb %bl, 2(%eax) ; X86-NEXT: movb %ch, 1(%eax) ; X86-NEXT: movb %cl, (%eax) ; X86-NEXT: popl %esi @@ -682,105 +555,82 @@ define <4 x i32> @ucmp_narrow_vec_op(<4 x i8> %x, <4 x i8> %y) nounwind { ; X64: # %bb.0: ; X64-NEXT: pxor %xmm2, %xmm2 ; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-NEXT: pextrw $0, %xmm1, %ecx +; X64-NEXT: pextrw $0, %xmm1, %eax ; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; X64-NEXT: movd %xmm3, %eax +; X64-NEXT: movd %xmm3, %ecx ; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; X64-NEXT: pextrw $0, %xmm0, %edx ; X64-NEXT: movdqa %xmm0, %xmm3 ; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] ; X64-NEXT: movd %xmm0, %esi -; X64-NEXT: xorl %edi, %edi -; X64-NEXT: cmpl %eax, %esi -; X64-NEXT: seta %dil -; X64-NEXT: movl $-1, %eax -; X64-NEXT: cmovbl %eax, %edi +; X64-NEXT: cmpl %ecx, %esi +; X64-NEXT: seta %cl +; X64-NEXT: sbbb $0, %cl ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X64-NEXT: movd %xmm0, %esi ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; X64-NEXT: movd %xmm0, %r8d -; X64-NEXT: xorl %r9d, %r9d -; X64-NEXT: cmpl %esi, %r8d -; X64-NEXT: movd %edi, %xmm0 -; X64-NEXT: seta %r9b -; X64-NEXT: cmovbl %eax, %r9d -; X64-NEXT: movd %r9d, %xmm2 +; X64-NEXT: movd %xmm0, %edi +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: movsbl %cl, %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: seta %cl +; X64-NEXT: sbbb $0, %cl +; X64-NEXT: movsbl %cl, %ecx +; X64-NEXT: movd %ecx, %xmm2 ; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm0 +; X64-NEXT: cmpl %eax, %edx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax +; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X64-NEXT: movd %xmm1, %ecx +; X64-NEXT: movd %xmm1, %eax ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; X64-NEXT: movd %xmm1, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm1 +; X64-NEXT: movd %xmm1, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax +; X64-NEXT: movd %eax, %xmm1 ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; X64-NEXT: retq ; ; X86-LABEL: ucmp_narrow_vec_op: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al -; X86-NEXT: seta %al -; X86-NEXT: movl $-1, %edx -; X86-NEXT: movl $-1, %ebp -; X86-NEXT: jb .LBB15_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movb %al, %bl -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: .LBB15_2: ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl +; X86-NEXT: seta %dl +; X86-NEXT: sbbb $0, %dl +; X86-NEXT: movsbl %dl, %edx +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bl ; X86-NEXT: seta %bl -; X86-NEXT: movl $-1, %esi -; X86-NEXT: jb .LBB15_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movb %bl, %al -; X86-NEXT: movl %eax, %esi -; X86-NEXT: .LBB15_4: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: sbbb $0, %bl +; X86-NEXT: movsbl %bl, %esi ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ch -; X86-NEXT: seta %cl -; X86-NEXT: movl $-1, %edi -; X86-NEXT: jb .LBB15_6 -; X86-NEXT: # %bb.5: -; X86-NEXT: movb %cl, %bl -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: .LBB15_6: -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: seta %ch +; X86-NEXT: sbbb $0, %ch +; X86-NEXT: movsbl %ch, %edi ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl ; X86-NEXT: seta %cl -; X86-NEXT: jb .LBB15_8 -; X86-NEXT: # %bb.7: -; X86-NEXT: movb %cl, %bl -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: .LBB15_8: -; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: sbbb $0, %cl +; X86-NEXT: movsbl %cl, %ecx +; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: movl %edi, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %ebp, (%eax) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %1 = call <4 x i32> @llvm.ucmp(<4 x i8> %x, <4 x i8> %y) ret <4 x i32> %1 @@ -798,178 +648,175 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind { ; X64-NEXT: pxor %xmm2, %xmm2 ; X64-NEXT: movdqa %xmm1, %xmm4 ; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; X64-NEXT: pextrw $0, %xmm4, %edi +; X64-NEXT: pextrw $0, %xmm4, %edx ; X64-NEXT: movdqa %xmm4, %xmm3 -; X64-NEXT: pextrw $4, %xmm4, %r11d +; X64-NEXT: pextrw $4, %xmm4, %r9d ; X64-NEXT: movdqa %xmm4, %xmm5 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm5[3,3,3,3] ; X64-NEXT: movd %xmm4, %eax ; X64-NEXT: movdqa %xmm0, %xmm6 ; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; X64-NEXT: pextrw $0, %xmm6, %r8d +; X64-NEXT: pextrw $0, %xmm6, %esi ; X64-NEXT: movdqa %xmm6, %xmm4 -; X64-NEXT: pextrw $4, %xmm6, %ebx +; X64-NEXT: pextrw $4, %xmm6, %r10d ; X64-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm6[3,3,3,3] ; X64-NEXT: movd %xmm7, %ecx -; X64-NEXT: xorl %esi, %esi ; X64-NEXT: cmpl %eax, %ecx -; X64-NEXT: seta %sil -; X64-NEXT: movl $-1, %edx -; X64-NEXT: cmovbl %edx, %esi -; X64-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax +; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] -; X64-NEXT: movd %xmm7, %esi +; X64-NEXT: movd %xmm7, %ecx ; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] -; X64-NEXT: movd %xmm7, %r9d -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %esi, %r9d -; X64-NEXT: seta %al -; X64-NEXT: cmovbl %edx, %eax +; X64-NEXT: movd %xmm7, %edi +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: seta %cl +; X64-NEXT: sbbb $0, %cl +; X64-NEXT: movsbl %cl, %eax ; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %edi, %r8d -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %edx, %esi +; X64-NEXT: cmpl %edx, %esi +; X64-NEXT: seta %dl +; X64-NEXT: sbbb $0, %dl +; X64-NEXT: movsbl %dl, %edx ; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] -; X64-NEXT: movd %xmm5, %r8d +; X64-NEXT: movd %xmm5, %esi ; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,1,1] -; X64-NEXT: movd %xmm5, %r9d -; X64-NEXT: xorl %edi, %edi -; X64-NEXT: cmpl %r8d, %r9d -; X64-NEXT: seta %dil -; X64-NEXT: cmovbl %edx, %edi +; X64-NEXT: movd %xmm5, %edi +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: seta %sil +; X64-NEXT: sbbb $0, %sil +; X64-NEXT: movsbl %sil, %esi ; X64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm3[3,3,3,3] -; X64-NEXT: movd %xmm5, %r9d +; X64-NEXT: movd %xmm5, %edi ; X64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm4[3,3,3,3] -; X64-NEXT: movd %xmm5, %r10d -; X64-NEXT: xorl %r8d, %r8d -; X64-NEXT: cmpl %r9d, %r10d -; X64-NEXT: seta %r8b -; X64-NEXT: cmovbl %edx, %r8d +; X64-NEXT: movd %xmm5, %r8d +; X64-NEXT: cmpl %edi, %r8d +; X64-NEXT: seta %dil +; X64-NEXT: sbbb $0, %dil +; X64-NEXT: movsbl %dil, %edi ; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] -; X64-NEXT: movd %xmm5, %r10d +; X64-NEXT: movd %xmm5, %r8d ; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] -; X64-NEXT: movd %xmm5, %ebp -; X64-NEXT: xorl %r9d, %r9d -; X64-NEXT: cmpl %r10d, %ebp +; X64-NEXT: movd %xmm5, %r11d +; X64-NEXT: cmpl %r8d, %r11d +; X64-NEXT: seta %r8b +; X64-NEXT: sbbb $0, %r8b +; X64-NEXT: movsbl %r8b, %r8d +; X64-NEXT: cmpl %r9d, %r10d ; X64-NEXT: seta %r9b -; X64-NEXT: cmovbl %edx, %r9d -; X64-NEXT: xorl %r10d, %r10d -; X64-NEXT: cmpl %r11d, %ebx -; X64-NEXT: seta %r10b -; X64-NEXT: cmovbl %edx, %r10d +; X64-NEXT: sbbb $0, %r9b +; X64-NEXT: movsbl %r9b, %r9d ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] -; X64-NEXT: movd %xmm3, %ebx +; X64-NEXT: movd %xmm3, %r10d ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,1,1] -; X64-NEXT: movd %xmm3, %ebp -; X64-NEXT: xorl %r11d, %r11d -; X64-NEXT: cmpl %ebx, %ebp -; X64-NEXT: seta %r11b -; X64-NEXT: cmovbl %edx, %r11d +; X64-NEXT: movd %xmm3, %r11d +; X64-NEXT: cmpl %r10d, %r11d +; X64-NEXT: seta %r10b +; X64-NEXT: sbbb $0, %r10b +; X64-NEXT: movsbl %r10b, %r10d ; X64-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; X64-NEXT: pextrw $0, %xmm1, %r15d +; X64-NEXT: pextrw $0, %xmm1, %ebx ; X64-NEXT: movdqa %xmm1, %xmm4 -; X64-NEXT: movdqa %xmm1, %xmm3 -; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm3[3,3,3,3] -; X64-NEXT: movd %xmm5, %ebp +; X64-NEXT: pextrw $4, %xmm1, %r11d +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] +; X64-NEXT: movd %xmm3, %r14d ; X64-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; X64-NEXT: pextrw $0, %xmm0, %r12d +; X64-NEXT: pextrw $0, %xmm0, %r15d ; X64-NEXT: movdqa %xmm0, %xmm5 -; X64-NEXT: movdqa %xmm0, %xmm6 -; X64-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm6[3,3,3,3] -; X64-NEXT: movd %xmm7, %r14d -; X64-NEXT: xorl %ebx, %ebx -; X64-NEXT: cmpl %ebp, %r14d -; X64-NEXT: seta %bl -; X64-NEXT: cmovbl %edx, %ebx -; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] -; X64-NEXT: movd %xmm7, %r14d -; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] -; X64-NEXT: movd %xmm7, %r13d -; X64-NEXT: xorl %ebp, %ebp -; X64-NEXT: cmpl %r14d, %r13d -; X64-NEXT: seta %bpl -; X64-NEXT: cmovbl %edx, %ebp -; X64-NEXT: xorl %r14d, %r14d -; X64-NEXT: cmpl %r15d, %r12d +; X64-NEXT: pextrw $4, %xmm0, %ebp +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; X64-NEXT: movd %xmm3, %r12d +; X64-NEXT: cmpl %r14d, %r12d ; X64-NEXT: seta %r14b -; X64-NEXT: cmovbl %edx, %r14d -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; X64-NEXT: sbbb $0, %r14b +; X64-NEXT: movsbl %r14b, %r14d +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; X64-NEXT: movd %xmm3, %r12d -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X64-NEXT: movd %xmm3, %r13d -; X64-NEXT: xorl %r15d, %r15d ; X64-NEXT: cmpl %r12d, %r13d +; X64-NEXT: seta %r12b +; X64-NEXT: sbbb $0, %r12b +; X64-NEXT: cmpl %ebx, %r15d +; X64-NEXT: seta %bl +; X64-NEXT: sbbb $0, %bl +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; X64-NEXT: movd %xmm1, %r15d +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-NEXT: movd %xmm0, %r13d +; X64-NEXT: cmpl %r15d, %r13d ; X64-NEXT: seta %r15b -; X64-NEXT: cmovbl %edx, %r15d +; X64-NEXT: sbbb $0, %r15b ; X64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] -; X64-NEXT: movd %xmm3, %r13d +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] +; X64-NEXT: movd %xmm0, %r13d ; X64-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,3,3,3] -; X64-NEXT: movd %xmm2, %eax -; X64-NEXT: xorl %r12d, %r12d +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; X64-NEXT: movd %xmm0, %eax ; X64-NEXT: cmpl %r13d, %eax -; X64-NEXT: seta %r12b -; X64-NEXT: cmovbl %edx, %r12d -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; X64-NEXT: movd %xmm2, %ecx -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] -; X64-NEXT: movd %xmm2, %eax -; X64-NEXT: xorl %r13d, %r13d -; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; X64-NEXT: movd %xmm0, %r13d +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; X64-NEXT: movd %xmm0, %ecx +; X64-NEXT: cmpl %r13d, %ecx +; X64-NEXT: movsbl %r12b, %ecx +; X64-NEXT: movsbl %bl, %ebx +; X64-NEXT: movsbl %r15b, %r15d ; X64-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload ; X64-NEXT: # xmm2 = mem[0],zero,zero,zero -; X64-NEXT: pextrw $4, %xmm1, %eax ; X64-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload ; X64-NEXT: # xmm3 = mem[0],zero,zero,zero -; X64-NEXT: pextrw $4, %xmm0, %ecx -; X64-NEXT: movd %esi, %xmm0 -; X64-NEXT: movd %edi, %xmm6 -; X64-NEXT: movd %r8d, %xmm7 -; X64-NEXT: movd %r9d, %xmm8 -; X64-NEXT: movd %r10d, %xmm1 -; X64-NEXT: movd %r11d, %xmm9 +; X64-NEXT: movd %edx, %xmm0 +; X64-NEXT: movd %esi, %xmm6 +; X64-NEXT: movd %edi, %xmm7 +; X64-NEXT: movd %r8d, %xmm8 +; X64-NEXT: movd %r9d, %xmm1 +; X64-NEXT: movd %r10d, %xmm9 ; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X64-NEXT: movd %ebx, %xmm10 +; X64-NEXT: movd %r14d, %xmm10 ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; X64-NEXT: movd %ebp, %xmm6 +; X64-NEXT: movd %ecx, %xmm6 ; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; X64-NEXT: movd %r14d, %xmm2 +; X64-NEXT: movd %ebx, %xmm2 ; X64-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; X64-NEXT: movd %r15d, %xmm3 ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; X64-NEXT: movd %r12d, %xmm7 ; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; X64-NEXT: movsbl %al, %eax +; X64-NEXT: movd %eax, %xmm7 +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax +; X64-NEXT: movd %eax, %xmm8 ; X64-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1] ; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] -; X64-NEXT: seta %r13b -; X64-NEXT: cmovbl %edx, %r13d -; X64-NEXT: movd %r13d, %xmm6 -; X64-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %eax, %ecx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %edx, %esi -; X64-NEXT: movd %esi, %xmm3 +; X64-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; X64-NEXT: cmpl %r11d, %ebp +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax +; X64-NEXT: movd %eax, %xmm3 ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] ; X64-NEXT: movd %xmm4, %eax ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] ; X64-NEXT: movd %xmm4, %ecx -; X64-NEXT: xorl %esi, %esi ; X64-NEXT: cmpl %eax, %ecx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %edx, %esi -; X64-NEXT: movd %esi, %xmm4 +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movsbl %al, %eax +; X64-NEXT: movd %eax, %xmm4 ; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; X64-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm8[0] ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 @@ -984,202 +831,115 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $48, %esp -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: subl $12, %esp ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: movb {{[0-9]+}}(%esp), %ah +; X86-NEXT: movb {{[0-9]+}}(%esp), %ch +; X86-NEXT: movb {{[0-9]+}}(%esp), %dh +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movb {{[0-9]+}}(%esp), %bh +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl +; X86-NEXT: seta %cl +; X86-NEXT: sbbb $0, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: seta %al +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bh +; X86-NEXT: seta %al +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bl +; X86-NEXT: seta %al +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dh +; X86-NEXT: seta %al +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ch +; X86-NEXT: seta %al +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ah +; X86-NEXT: seta %al +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl -; X86-NEXT: seta %dl -; X86-NEXT: movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl $-1, %esi -; X86-NEXT: jb .LBB16_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movb %dl, %cl -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: .LBB16_2: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %edx, %edx +; X86-NEXT: seta %bl +; X86-NEXT: sbbb $0, %bl +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al ; X86-NEXT: seta %al -; X86-NEXT: movl $-1, %edi -; X86-NEXT: jb .LBB16_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movb %al, %dl -; X86-NEXT: movl %edx, %edi -; X86-NEXT: .LBB16_4: +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb %al, (%esp) # 1-byte Spill +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: seta %bh +; X86-NEXT: sbbb $0, %bh ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl -; X86-NEXT: seta %cl -; X86-NEXT: movl $-1, %ebx -; X86-NEXT: jb .LBB16_6 -; X86-NEXT: # %bb.5: -; X86-NEXT: movb %cl, %dl -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: .LBB16_6: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %edx, %edx ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al ; X86-NEXT: seta %al -; X86-NEXT: movl $-1, %ebp -; X86-NEXT: jb .LBB16_8 -; X86-NEXT: # %bb.7: -; X86-NEXT: movb %al, %dl -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: .LBB16_8: +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl -; X86-NEXT: seta %cl -; X86-NEXT: movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: jb .LBB16_10 -; X86-NEXT: # %bb.9: -; X86-NEXT: movb %cl, %dl -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: .LBB16_10: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %edx, %edx ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al ; X86-NEXT: seta %al -; X86-NEXT: movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: jb .LBB16_12 -; X86-NEXT: # %bb.11: -; X86-NEXT: movb %al, %dl -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: .LBB16_12: +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl -; X86-NEXT: seta %cl -; X86-NEXT: movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: jb .LBB16_14 -; X86-NEXT: # %bb.13: -; X86-NEXT: movb %cl, %dl -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: .LBB16_14: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %edx, %edx ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al ; X86-NEXT: seta %al -; X86-NEXT: movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: jb .LBB16_16 -; X86-NEXT: # %bb.15: -; X86-NEXT: movb %al, %dl -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: .LBB16_16: +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %ebp ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl -; X86-NEXT: seta %cl -; X86-NEXT: movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: jb .LBB16_18 -; X86-NEXT: # %bb.17: -; X86-NEXT: movb %cl, %dl -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: .LBB16_18: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %edx, %edx ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al ; X86-NEXT: seta %al -; X86-NEXT: movl $-1, (%esp) # 4-byte Folded Spill -; X86-NEXT: jb .LBB16_20 -; X86-NEXT: # %bb.19: -; X86-NEXT: movb %al, %dl -; X86-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-NEXT: .LBB16_20: -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl -; X86-NEXT: seta %cl -; X86-NEXT: movl $-1, %ebx -; X86-NEXT: jb .LBB16_22 -; X86-NEXT: # %bb.21: -; X86-NEXT: movb %cl, %dl -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: .LBB16_22: -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al ; X86-NEXT: seta %al -; X86-NEXT: movl $-1, %ebp -; X86-NEXT: jb .LBB16_24 -; X86-NEXT: # %bb.23: -; X86-NEXT: movb %al, %bl -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: .LBB16_24: -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl -; X86-NEXT: seta %ah -; X86-NEXT: movl $-1, %edx -; X86-NEXT: jb .LBB16_26 -; X86-NEXT: # %bb.25: -; X86-NEXT: movb %ah, %bl -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: .LBB16_26: -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al ; X86-NEXT: seta %al -; X86-NEXT: movl $-1, %esi -; X86-NEXT: jb .LBB16_28 -; X86-NEXT: # %bb.27: -; X86-NEXT: movb %al, %bl -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: .LBB16_28: +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movsbl %al, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl -; X86-NEXT: seta %cl -; X86-NEXT: movl $-1, %edi -; X86-NEXT: jb .LBB16_30 -; X86-NEXT: # %bb.29: -; X86-NEXT: movb %cl, %bl -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: .LBB16_30: -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ch -; X86-NEXT: seta %cl -; X86-NEXT: jb .LBB16_32 -; X86-NEXT: # %bb.31: -; X86-NEXT: movb %cl, %bl -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: .LBB16_32: -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, 60(%eax) -; X86-NEXT: movl %edi, 56(%eax) +; X86-NEXT: movl %edx, 56(%eax) ; X86-NEXT: movl %esi, 52(%eax) -; X86-NEXT: movl %edx, 48(%eax) -; X86-NEXT: movl %ebp, 44(%eax) +; X86-NEXT: movl %ebp, 48(%eax) +; X86-NEXT: movl %edi, 44(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, 40(%eax) -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X86-NEXT: movsbl %bh, %ecx ; X86-NEXT: movl %ecx, 36(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 32(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 28(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 24(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 20(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 16(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: movsbl (%esp), %edx # 1-byte Folded Reload +; X86-NEXT: movl %edx, 32(%eax) +; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload +; X86-NEXT: movsbl %bl, %edi +; X86-NEXT: movl %edi, 28(%eax) +; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload +; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload +; X86-NEXT: movl %ebx, 24(%eax) +; X86-NEXT: movl %edi, 20(%eax) +; X86-NEXT: movl %edx, 16(%eax) ; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: addl $48, %esp +; X86-NEXT: addl $12, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1196,150 +956,149 @@ define <16 x i8> @ucmp_wide_vec_op(<16 x i32> %x, <16 x i32> %y) nounwind { ; X64-NEXT: movd %xmm8, %eax ; X64-NEXT: pshufd {{.*#+}} xmm8 = xmm3[3,3,3,3] ; X64-NEXT: movd %xmm8, %ecx -; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpl %eax, %ecx -; X64-NEXT: seta %dl -; X64-NEXT: movl $255, %eax -; X64-NEXT: cmovbl %eax, %edx -; X64-NEXT: movd %edx, %xmm8 +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: movd %eax, %xmm8 ; X64-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,3,2,3] -; X64-NEXT: movd %xmm9, %ecx +; X64-NEXT: movd %xmm9, %eax ; X64-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,3,2,3] -; X64-NEXT: movd %xmm9, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm9 +; X64-NEXT: movd %xmm9, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: movd %eax, %xmm9 ; X64-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; X64-NEXT: movd %xmm7, %ecx -; X64-NEXT: movd %xmm3, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm8 +; X64-NEXT: movd %xmm7, %eax +; X64-NEXT: movd %xmm3, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: movd %eax, %xmm8 ; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] -; X64-NEXT: movd %xmm7, %ecx +; X64-NEXT: movd %xmm7, %eax ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] -; X64-NEXT: movd %xmm3, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm3 +; X64-NEXT: movd %xmm3, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: movd %eax, %xmm3 ; X64-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] ; X64-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm6[3,3,3,3] -; X64-NEXT: movd %xmm3, %ecx +; X64-NEXT: movd %xmm3, %eax ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] -; X64-NEXT: movd %xmm3, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm3 +; X64-NEXT: movd %xmm3, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: movd %eax, %xmm3 ; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] -; X64-NEXT: movd %xmm7, %ecx +; X64-NEXT: movd %xmm7, %eax ; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] -; X64-NEXT: movd %xmm7, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm7 +; X64-NEXT: movd %xmm7, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: movd %eax, %xmm7 ; X64-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; X64-NEXT: movd %xmm6, %ecx -; X64-NEXT: movd %xmm2, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm3 +; X64-NEXT: movd %xmm6, %eax +; X64-NEXT: movd %xmm2, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: movd %eax, %xmm3 ; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] -; X64-NEXT: movd %xmm6, %ecx +; X64-NEXT: movd %xmm6, %eax ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; X64-NEXT: movd %xmm2, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm2 +; X64-NEXT: movd %xmm2, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: movd %eax, %xmm2 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] ; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,3,3,3] -; X64-NEXT: movd %xmm2, %ecx +; X64-NEXT: movd %xmm2, %eax ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; X64-NEXT: movd %xmm2, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm2 +; X64-NEXT: movd %xmm2, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: movd %eax, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] -; X64-NEXT: movd %xmm6, %ecx +; X64-NEXT: movd %xmm6, %eax ; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; X64-NEXT: movd %xmm6, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm6 +; X64-NEXT: movd %xmm6, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: movd %eax, %xmm6 ; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; X64-NEXT: movd %xmm5, %ecx -; X64-NEXT: movd %xmm1, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm2 +; X64-NEXT: movd %xmm5, %eax +; X64-NEXT: movd %xmm1, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: movd %eax, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] -; X64-NEXT: movd %xmm5, %ecx +; X64-NEXT: movd %xmm5, %eax ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X64-NEXT: movd %xmm1, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm1 +; X64-NEXT: movd %xmm1, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: movd %eax, %xmm1 ; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] -; X64-NEXT: movd %xmm1, %ecx +; X64-NEXT: movd %xmm1, %eax ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; X64-NEXT: movd %xmm1, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm1 +; X64-NEXT: movd %xmm1, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: movd %eax, %xmm1 ; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] -; X64-NEXT: movd %xmm5, %ecx +; X64-NEXT: movd %xmm5, %eax ; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] -; X64-NEXT: movd %xmm5, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm5 +; X64-NEXT: movd %xmm5, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: movd %eax, %xmm5 ; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; X64-NEXT: movd %xmm4, %ecx -; X64-NEXT: movd %xmm0, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm1 +; X64-NEXT: movd %xmm4, %eax +; X64-NEXT: movd %xmm0, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: movd %eax, %xmm1 ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; X64-NEXT: movd %xmm4, %ecx +; X64-NEXT: movd %xmm4, %eax ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X64-NEXT: movd %xmm0, %edx -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpl %ecx, %edx -; X64-NEXT: seta %sil -; X64-NEXT: cmovbl %eax, %esi -; X64-NEXT: movd %esi, %xmm0 +; X64-NEXT: movd %xmm0, %ecx +; X64-NEXT: cmpl %eax, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbb $0, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] @@ -1349,155 +1108,91 @@ define <16 x i8> @ucmp_wide_vec_op(<16 x i32> %x, <16 x i32> %y) nounwind { ; ; X86-LABEL: ucmp_wide_vec_op: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $12, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: seta %cl -; X86-NEXT: movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movb $-1, %dl -; X86-NEXT: jb .LBB17_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: .LBB17_2: +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: seta %al -; X86-NEXT: movb $-1, %ah -; X86-NEXT: jb .LBB17_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movb %al, %ah -; X86-NEXT: .LBB17_4: -; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: seta %al +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: seta %al +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: seta %al +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: seta %al +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: seta %cl -; X86-NEXT: movb $-1, %ch -; X86-NEXT: jb .LBB17_6 -; X86-NEXT: # %bb.5: -; X86-NEXT: movb %cl, %ch -; X86-NEXT: .LBB17_6: -; X86-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X86-NEXT: seta %al -; X86-NEXT: movb $-1, %ah -; X86-NEXT: jb .LBB17_8 -; X86-NEXT: # %bb.7: -; X86-NEXT: movb %al, %ah -; X86-NEXT: .LBB17_8: -; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: seta %cl -; X86-NEXT: movb $-1, %ch -; X86-NEXT: jb .LBB17_10 -; X86-NEXT: # %bb.9: -; X86-NEXT: movb %cl, %ch -; X86-NEXT: .LBB17_10: -; X86-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: seta %al +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X86-NEXT: seta %al -; X86-NEXT: movb $-1, %ah -; X86-NEXT: jb .LBB17_12 -; X86-NEXT: # %bb.11: -; X86-NEXT: movb %al, %ah -; X86-NEXT: .LBB17_12: -; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: seta %cl -; X86-NEXT: movb $-1, %ch -; X86-NEXT: jb .LBB17_14 -; X86-NEXT: # %bb.13: -; X86-NEXT: movb %cl, %ch -; X86-NEXT: .LBB17_14: -; X86-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: seta %al +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X86-NEXT: seta %al -; X86-NEXT: movb $-1, %ah -; X86-NEXT: jb .LBB17_16 -; X86-NEXT: # %bb.15: -; X86-NEXT: movb %al, %ah -; X86-NEXT: .LBB17_16: -; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: seta %cl -; X86-NEXT: movb $-1, %ch -; X86-NEXT: jb .LBB17_18 -; X86-NEXT: # %bb.17: -; X86-NEXT: movb %cl, %ch -; X86-NEXT: .LBB17_18: -; X86-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: seta %bh +; X86-NEXT: sbbb $0, %bh +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; X86-NEXT: seta %al -; X86-NEXT: movb $-1, %ah -; X86-NEXT: jb .LBB17_20 -; X86-NEXT: # %bb.19: -; X86-NEXT: movb %al, %ah -; X86-NEXT: .LBB17_20: -; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: seta %cl -; X86-NEXT: movb $-1, %bh -; X86-NEXT: jb .LBB17_22 -; X86-NEXT: # %bb.21: -; X86-NEXT: movb %cl, %bh -; X86-NEXT: .LBB17_22: -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: seta %bl +; X86-NEXT: sbbb $0, %bl ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; X86-NEXT: seta %al -; X86-NEXT: movb $-1, %cl -; X86-NEXT: jb .LBB17_24 -; X86-NEXT: # %bb.23: -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: .LBB17_24: +; X86-NEXT: seta %dh +; X86-NEXT: sbbb $0, %dh ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X86-NEXT: seta %ch -; X86-NEXT: movb $-1, %dl -; X86-NEXT: jb .LBB17_26 -; X86-NEXT: # %bb.25: -; X86-NEXT: movb %ch, %dl -; X86-NEXT: .LBB17_26: -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: sbbb $0, %ch +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; X86-NEXT: seta %al -; X86-NEXT: movb $-1, %ch -; X86-NEXT: jb .LBB17_28 -; X86-NEXT: # %bb.27: -; X86-NEXT: movb %al, %ch -; X86-NEXT: .LBB17_28: +; X86-NEXT: seta %dl +; X86-NEXT: sbbb $0, %dl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi -; X86-NEXT: seta %bl -; X86-NEXT: movb $-1, %dh -; X86-NEXT: jb .LBB17_30 -; X86-NEXT: # %bb.29: -; X86-NEXT: movb %bl, %dh -; X86-NEXT: .LBB17_30: -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi -; X86-NEXT: seta %bl -; X86-NEXT: jb .LBB17_32 -; X86-NEXT: # %bb.31: -; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: .LBB17_32: -; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Reload -; X86-NEXT: movb %bl, 15(%eax) -; X86-NEXT: movb %dh, 14(%eax) +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: seta %cl +; X86-NEXT: sbbb $0, %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb %cl, 15(%eax) +; X86-NEXT: movb %dl, 14(%eax) ; X86-NEXT: movb %ch, 13(%eax) -; X86-NEXT: movb %dl, 12(%eax) -; X86-NEXT: movb %cl, 11(%eax) +; X86-NEXT: movb %dh, 12(%eax) +; X86-NEXT: movb %bl, 11(%eax) ; X86-NEXT: movb %bh, 10(%eax) ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NEXT: movb %cl, 9(%eax) @@ -1523,6 +1218,7 @@ define <16 x i8> @ucmp_wide_vec_op(<16 x i32> %x, <16 x i32> %y) nounwind { ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %1 = call <16 x i8> @llvm.ucmp(<16 x i32> %x, <16 x i32> %y) ret <16 x i8> %1 @@ -1600,70 +1296,66 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax ; X64-NEXT: andl $127, %eax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; X64-NEXT: andl $127, %r13d +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; X64-NEXT: andl $127, %r12d ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax ; X64-NEXT: andl $127, %eax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; X64-NEXT: andl $127, %r15d +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; X64-NEXT: andl $127, %r14d ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax ; X64-NEXT: andl $127, %eax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; X64-NEXT: andl $127, %ebx -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; X64-NEXT: andl $127, %r12d +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; X64-NEXT: andl $127, %r15d ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp ; X64-NEXT: andl $127, %ebp ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; X64-NEXT: andl $127, %r11d -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; X64-NEXT: andl $127, %r8d +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; X64-NEXT: andl $127, %r13d ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; X64-NEXT: andl $127, %r10d -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; X64-NEXT: andl $127, %edx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; X64-NEXT: andl $127, %esi -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; X64-NEXT: andl $127, %r14d -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; X64-NEXT: andl $127, %ecx -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; X64-NEXT: cmpq %r9, %rdi -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: sbbq %r14, %rax -; X64-NEXT: setb %al -; X64-NEXT: cmpq %rdi, %r9 -; X64-NEXT: sbbq %rcx, %r14 -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: movl $255, %r14d -; X64-NEXT: cmovbl %r14d, %eax -; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: andl $127, %edi +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: andl $127, %eax +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; X64-NEXT: andl $127, %edx +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; X64-NEXT: cmpq %r9, %r8 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: sbbq %rax, %rcx +; X64-NEXT: setb %cl +; X64-NEXT: cmpq %r8, %r9 +; X64-NEXT: sbbq %rdx, %rax +; X64-NEXT: sbbb $0, %cl +; X64-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; X64-NEXT: cmpq %rax, %rcx -; X64-NEXT: movq %rsi, %rdi -; X64-NEXT: sbbq %rdx, %rdi -; X64-NEXT: setb %dil -; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: movq %rdi, %rdx ; X64-NEXT: sbbq %rsi, %rdx -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: cmovbl %r14d, %eax -; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: setb %dl +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: sbbq %rdi, %rsi +; X64-NEXT: sbbb $0, %dl +; X64-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; X64-NEXT: cmpq %rax, %rcx ; X64-NEXT: movq %r10, %rdx -; X64-NEXT: sbbq %r8, %rdx +; X64-NEXT: sbbq %r13, %rdx ; X64-NEXT: setb %dl ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: sbbq %r10, %r8 -; X64-NEXT: movzbl %dl, %eax -; X64-NEXT: cmovbl %r14d, %eax -; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: sbbq %r10, %r13 +; X64-NEXT: sbbb $0, %dl +; X64-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; X64-NEXT: cmpq %rax, %rcx @@ -1672,184 +1364,179 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; X64-NEXT: setb %dl ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: sbbq %r11, %rbp -; X64-NEXT: movzbl %dl, %eax -; X64-NEXT: cmovbl %r14d, %eax -; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: sbbb $0, %dl +; X64-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; X64-NEXT: cmpq %rax, %rcx -; X64-NEXT: movq %r12, %rdx +; X64-NEXT: movq %r15, %rdx ; X64-NEXT: sbbq %rbx, %rdx ; X64-NEXT: setb %dl ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: sbbq %r12, %rbx -; X64-NEXT: movzbl %dl, %eax -; X64-NEXT: cmovbl %r14d, %eax -; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; X64-NEXT: cmpq %rax, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq %rsi, %rdx -; X64-NEXT: sbbq %r15, %rdx -; X64-NEXT: setb %dl -; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: sbbq %rsi, %r15 -; X64-NEXT: movzbl %dl, %eax -; X64-NEXT: cmovbl %r14d, %eax -; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: sbbq %r15, %rbx +; X64-NEXT: sbbb $0, %dl +; X64-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; X64-NEXT: cmpq %rax, %rcx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: movq %rsi, %rdx -; X64-NEXT: sbbq %r13, %rdx +; X64-NEXT: sbbq %r14, %rdx ; X64-NEXT: setb %dl ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: sbbq %rsi, %r13 -; X64-NEXT: movzbl %dl, %eax -; X64-NEXT: cmovbl %r14d, %eax -; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: sbbq %rsi, %r14 +; X64-NEXT: sbbb $0, %dl +; X64-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; X64-NEXT: cmpq %rcx, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: sbbq %rax, %rsi -; X64-NEXT: setb %sil +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: sbbq %r12, %rax +; X64-NEXT: setb %r13b ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: sbbq %rdi, %rax -; X64-NEXT: movzbl %sil, %ebp -; X64-NEXT: cmovbl %r14d, %ebp +; X64-NEXT: sbbq %rsi, %r12 +; X64-NEXT: sbbb $0, %r13b ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; X64-NEXT: cmpq %rdx, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq %rcx, %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, %rcx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: sbbq %rax, %rdi -; X64-NEXT: setb %dil +; X64-NEXT: sbbq %rax, %rcx +; X64-NEXT: setb %bpl ; X64-NEXT: cmpq %rsi, %rdx -; X64-NEXT: sbbq %rcx, %rax -; X64-NEXT: movzbl %dil, %ebx -; X64-NEXT: cmovbl %r14d, %ebx +; X64-NEXT: sbbq %rdi, %rax +; X64-NEXT: sbbb $0, %bpl ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; X64-NEXT: cmpq %rsi, %rdi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq %rcx, %r8 +; X64-NEXT: movq %rcx, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: sbbq %rax, %r8 -; X64-NEXT: setb %r8b +; X64-NEXT: sbbq %rax, %rdx +; X64-NEXT: setb %r11b ; X64-NEXT: cmpq %rdi, %rsi ; X64-NEXT: sbbq %rcx, %rax -; X64-NEXT: movzbl %r8b, %r10d -; X64-NEXT: cmovbl %r14d, %r10d +; X64-NEXT: sbbb $0, %r11b ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8 ; X64-NEXT: cmpq %rdi, %r8 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq %rcx, %r9 +; X64-NEXT: movq %rcx, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: sbbq %rax, %r9 -; X64-NEXT: setb %r9b +; X64-NEXT: sbbq %rax, %rsi +; X64-NEXT: setb %sil ; X64-NEXT: cmpq %r8, %rdi ; X64-NEXT: sbbq %rcx, %rax -; X64-NEXT: movzbl %r9b, %r8d -; X64-NEXT: cmovbl %r14d, %r8d -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; X64-NEXT: sbbb $0, %sil +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; X64-NEXT: cmpq %r8, %r9 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq %rcx, %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: sbbq %rax, %rdi +; X64-NEXT: setb %dil +; X64-NEXT: cmpq %r9, %r8 +; X64-NEXT: sbbq %rcx, %rax +; X64-NEXT: sbbb $0, %dil ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; X64-NEXT: cmpq %rdi, %r9 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; X64-NEXT: cmpq %r9, %r10 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq %rcx, %r11 +; X64-NEXT: movq %rcx, %r8 ; X64-NEXT: movq (%rsp), %rax # 8-byte Reload -; X64-NEXT: sbbq %rax, %r11 -; X64-NEXT: setb %r11b -; X64-NEXT: cmpq %r9, %rdi +; X64-NEXT: sbbq %rax, %r8 +; X64-NEXT: setb %r8b +; X64-NEXT: cmpq %r10, %r9 ; X64-NEXT: sbbq %rcx, %rax -; X64-NEXT: movzbl %r11b, %r9d -; X64-NEXT: cmovbl %r14d, %r9d -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; X64-NEXT: cmpq %rdi, %r11 +; X64-NEXT: sbbb $0, %r8b +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; X64-NEXT: cmpq %r10, %rbx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq %rcx, %r15 +; X64-NEXT: movq %rcx, %r9 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: sbbq %rax, %r15 -; X64-NEXT: setb %r15b -; X64-NEXT: cmpq %r11, %rdi +; X64-NEXT: sbbq %rax, %r9 +; X64-NEXT: setb %r9b +; X64-NEXT: cmpq %rbx, %r10 ; X64-NEXT: sbbq %rcx, %rax -; X64-NEXT: movzbl %r15b, %r11d -; X64-NEXT: cmovbl %r14d, %r11d -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; X64-NEXT: sbbb $0, %r9b +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: cmpq %rax, %rdi +; X64-NEXT: cmpq %rax, %rbx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: sbbq %rcx, %r15 -; X64-NEXT: setb %r15b -; X64-NEXT: cmpq %rdi, %rax +; X64-NEXT: sbbq %rcx, %r10 +; X64-NEXT: setb %r10b +; X64-NEXT: cmpq %rbx, %rax ; X64-NEXT: sbbq %rdx, %rcx -; X64-NEXT: movzbl %r15b, %edi -; X64-NEXT: cmovbl %r14d, %edi -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; X64-NEXT: sbbb $0, %r10b +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: cmpq %rcx, %r15 +; X64-NEXT: cmpq %rcx, %r14 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: sbbq %rax, %r12 -; X64-NEXT: setb %r12b -; X64-NEXT: cmpq %r15, %rcx +; X64-NEXT: sbbq %rax, %rbx +; X64-NEXT: setb %bl +; X64-NEXT: cmpq %r14, %rcx ; X64-NEXT: sbbq %rdx, %rax -; X64-NEXT: movzbl %r12b, %r15d -; X64-NEXT: cmovbl %r14d, %r15d -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; X64-NEXT: sbbb $0, %bl ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: cmpq %rcx, %r12 +; X64-NEXT: cmpq %rcx, %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: sbbq %rax, %r13 -; X64-NEXT: setb %r13b -; X64-NEXT: cmpq %r12, %rcx +; X64-NEXT: sbbq %rax, %r14 +; X64-NEXT: setb %r14b +; X64-NEXT: cmpq %r15, %rcx +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 ; X64-NEXT: sbbq %rdx, %rax -; X64-NEXT: movzbl %r13b, %r12d -; X64-NEXT: cmovbl %r14d, %r12d -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; X64-NEXT: cmpq %rsi, %rdx +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: sbbb $0, %r14b +; X64-NEXT: cmpq %r12, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq %rcx, %r13 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: sbbq %rax, %r13 -; X64-NEXT: setb %r13b -; X64-NEXT: cmpq %rdx, %rsi -; X64-NEXT: sbbq %rcx, %rax -; X64-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; X64-NEXT: # xmm0 = mem[0],zero,zero,zero -; X64-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload -; X64-NEXT: # xmm1 = mem[0],zero,zero,zero -; X64-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload -; X64-NEXT: # xmm2 = mem[0],zero,zero,zero -; X64-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload -; X64-NEXT: # xmm3 = mem[0],zero,zero,zero -; X64-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload -; X64-NEXT: # xmm4 = mem[0],zero,zero,zero -; X64-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload -; X64-NEXT: # xmm5 = mem[0],zero,zero,zero -; X64-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload -; X64-NEXT: # xmm6 = mem[0],zero,zero,zero -; X64-NEXT: movd %ebp, %xmm7 -; X64-NEXT: movd %ebx, %xmm8 -; X64-NEXT: movd %r10d, %xmm9 -; X64-NEXT: movd %r8d, %xmm10 -; X64-NEXT: movd %r9d, %xmm11 -; X64-NEXT: movd %r11d, %xmm12 -; X64-NEXT: movd %edi, %xmm13 -; X64-NEXT: movd %r15d, %xmm14 +; X64-NEXT: sbbq %rcx, %r15 +; X64-NEXT: setb %r15b +; X64-NEXT: cmpq %rax, %r12 +; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; X64-NEXT: movd %eax, %xmm0 +; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; X64-NEXT: movd %eax, %xmm1 +; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; X64-NEXT: movd %eax, %xmm2 +; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; X64-NEXT: movd %eax, %xmm3 +; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; X64-NEXT: movd %eax, %xmm4 +; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; X64-NEXT: movd %eax, %xmm5 +; X64-NEXT: movzbl %r13b, %eax +; X64-NEXT: movd %eax, %xmm6 +; X64-NEXT: movzbl %bpl, %eax +; X64-NEXT: movd %eax, %xmm7 +; X64-NEXT: movzbl %r11b, %eax +; X64-NEXT: movd %eax, %xmm8 +; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: movd %eax, %xmm9 +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: movd %eax, %xmm10 +; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: movd %eax, %xmm11 +; X64-NEXT: movzbl %r9b, %eax +; X64-NEXT: movd %eax, %xmm12 +; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: movd %eax, %xmm13 +; X64-NEXT: movzbl %bl, %eax +; X64-NEXT: movd %eax, %xmm14 +; X64-NEXT: movzbl %r14b, %eax +; X64-NEXT: movd %eax, %xmm15 ; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] @@ -1861,17 +1548,17 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; X64-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; X64-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; X64-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; X64-NEXT: movd %r12d, %xmm0 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; X64-NEXT: movzbl %r13b, %eax -; X64-NEXT: cmovbl %r14d, %eax +; X64-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; X64-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; X64-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm7[0] +; X64-NEXT: sbbq %rdx, %rcx +; X64-NEXT: sbbb $0, %r15b +; X64-NEXT: movzbl %r15b, %eax ; X64-NEXT: andl $3, %eax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NEXT: movb %al, 4(%rdi) -; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: movdqa %xmm15, -{{[0-9]+}}(%rsp) ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: andl $3, %eax ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx @@ -1950,502 +1637,471 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $44, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: andl $127, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: andl $127, %edi +; X86-NEXT: subl $132, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $127, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: andl $127, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: cmpl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: movl %esi, %edx -; X86-NEXT: sbbl %eax, %edx -; X86-NEXT: movl $0, %edx -; X86-NEXT: sbbl %edx, %edx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl %ebp, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl $0, %eax -; X86-NEXT: sbbl %eax, %eax -; X86-NEXT: movb $-1, %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: jb .LBB18_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; X86-NEXT: .LBB18_2: -; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: andl $127, %edx -; X86-NEXT: andl $127, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmpl %eax, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl %ebp, %ebx -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: sbbl %ecx, %ebx -; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %edi, %ecx -; X86-NEXT: movl $0, %edi -; X86-NEXT: sbbl %edi, %edi -; X86-NEXT: movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: jb .LBB18_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: .LBB18_4: ; X86-NEXT: andl $127, %eax -; X86-NEXT: andl $127, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl %ebx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: sbbl %ebp, %edi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: sbbl %edx, %edi -; X86-NEXT: movl $0, %edi -; X86-NEXT: sbbl %edi, %edi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl %ebx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: movl $0, %esi -; X86-NEXT: sbbl %esi, %esi -; X86-NEXT: movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: jb .LBB18_6 -; X86-NEXT: # %bb.5: -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload -; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: .LBB18_6: -; X86-NEXT: andl $127, %ecx -; X86-NEXT: andl $127, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl %edi, %edx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: andl $127, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $127, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl $127, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %esi -; X86-NEXT: sbbl %ebp, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: andl $127, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %esi ; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: movl %edi, %esi +; X86-NEXT: sbbl %edx, %esi ; X86-NEXT: movl $0, %esi ; X86-NEXT: sbbl %esi, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: cmpl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl %edi, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl %ebx, %eax -; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: movb $-1, %bl -; X86-NEXT: jb .LBB18_8 -; X86-NEXT: # %bb.7: -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; X86-NEXT: .LBB18_8: -; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: andl $127, %edx -; X86-NEXT: andl $127, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl %ebp, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: sbbl %ecx, %ebx -; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: cmpl %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %edi, %edx +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: sbbb $0, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, %edi +; X86-NEXT: sbbl %ebp, %edi ; X86-NEXT: movl $0, %edi ; X86-NEXT: sbbl %edi, %edi -; X86-NEXT: movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: jb .LBB18_10 -; X86-NEXT: # %bb.9: -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: .LBB18_10: -; X86-NEXT: andl $127, %eax -; X86-NEXT: andl $127, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: setb %bl +; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: sbbb $0, %bl +; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl %esi, %ecx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %edi -; X86-NEXT: sbbl %ebp, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %edi ; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: sbbl %ebp, %edi ; X86-NEXT: movl $0, %edi ; X86-NEXT: sbbl %edi, %edi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: setb %bl ; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: sbbb $0, %bl +; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl %esi, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: movl $0, %esi -; X86-NEXT: sbbl %esi, %esi -; X86-NEXT: movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: jb .LBB18_12 -; X86-NEXT: # %bb.11: -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload -; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: .LBB18_12: -; X86-NEXT: andl $127, %ecx -; X86-NEXT: andl $127, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl %edi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %esi -; X86-NEXT: sbbl %ebp, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: sbbl %eax, %esi -; X86-NEXT: movl $0, %esi -; X86-NEXT: sbbl %esi, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: cmpl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %edi, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl %ebx, %eax -; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: movb $-1, %bl -; X86-NEXT: jb .LBB18_14 -; X86-NEXT: # %bb.13: -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; X86-NEXT: .LBB18_14: -; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: andl $127, %edx -; X86-NEXT: andl $127, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: cmpl %ebp, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: sbbl %ecx, %ebx -; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: sbbl %ebp, %edi +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi ; X86-NEXT: setb %bl -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: sbbb $0, %bl +; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: sbbl %ebp, %edi ; X86-NEXT: movl $0, %edi ; X86-NEXT: sbbl %edi, %edi -; X86-NEXT: movb $-1, %bh -; X86-NEXT: jb .LBB18_16 -; X86-NEXT: # %bb.15: -; X86-NEXT: movb %bl, %bh -; X86-NEXT: .LBB18_16: -; X86-NEXT: movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: andl $127, %eax -; X86-NEXT: andl $127, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl %ebx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: sbbl %edx, %ebx -; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: setb %bl ; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: sbbb $0, %bl +; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl %edi, %edx -; X86-NEXT: movl $0, %edx -; X86-NEXT: sbbl %edx, %edx -; X86-NEXT: movb $-1, %dl -; X86-NEXT: jb .LBB18_18 -; X86-NEXT: # %bb.17: -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload -; X86-NEXT: .LBB18_18: -; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: andl $127, %ecx -; X86-NEXT: andl $127, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %ebx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %edx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: sbbl %ebp, %edi +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: setb %bl +; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: sbbl %eax, %edx -; X86-NEXT: movl $0, %edx -; X86-NEXT: sbbl %edx, %edx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: sbbb $0, %bl +; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %ebx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl %ebp, %eax +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: sbbl %ebp, %edi +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: setb %bl +; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: sbbb $0, %bl +; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebp, %ebp -; X86-NEXT: movb $-1, %dh -; X86-NEXT: jb .LBB18_20 -; X86-NEXT: # %bb.19: -; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %dh # 1-byte Reload -; X86-NEXT: .LBB18_20: -; X86-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: andl $127, %edi -; X86-NEXT: andl $127, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl %ebp, %eax +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: setb %cl +; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: sbbl %ebp, %ebx +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: sbbb $0, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: sbbl %ecx, %edx -; X86-NEXT: movl $0, %edx -; X86-NEXT: sbbl %edx, %edx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: cmpl %eax, %ebp +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: setb %cl +; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: sbbl %ebp, %ebx +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: sbbb $0, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl $0, %ecx -; X86-NEXT: sbbl %ecx, %ecx -; X86-NEXT: movb $-1, %cl -; X86-NEXT: jb .LBB18_22 -; X86-NEXT: # %bb.21: -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: .LBB18_22: +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: setb %cl +; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: sbbl %ebp, %ebx +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: sbbb $0, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: andl $127, %eax -; X86-NEXT: andl $127, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %ebp, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %esi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sbbl %edi, %ecx -; X86-NEXT: movl $0, %ecx -; X86-NEXT: sbbl %ecx, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: cmpl %ebx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %edx, %edi -; X86-NEXT: movl $0, %esi -; X86-NEXT: sbbl %esi, %esi -; X86-NEXT: movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: jb .LBB18_24 -; X86-NEXT: # %bb.23: -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload -; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: .LBB18_24: -; X86-NEXT: andl $127, %ebp -; X86-NEXT: andl $127, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %esi, %edi +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, %esi -; X86-NEXT: sbbl %eax, %esi -; X86-NEXT: movl $0, %esi -; X86-NEXT: sbbl %esi, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edi, %edx +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: sbbl %ebx, %edx +; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: setb %dl +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: sbbl %edi, %esi +; X86-NEXT: sbbl %ebp, %ebx +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: sbbb $0, %dl +; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: sbbl %esi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: setb %bl +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: sbbl %edi, %esi +; X86-NEXT: sbbl %ebp, %edx ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax -; X86-NEXT: movb $-1, %al -; X86-NEXT: jb .LBB18_26 -; X86-NEXT: # %bb.25: -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: .LBB18_26: -; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: andl $127, %edi -; X86-NEXT: andl $127, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: sbbb $0, %bl +; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %esi, %eax -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: sbbl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: sbbl %esi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: setb %bl +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: sbbl %edi, %esi +; X86-NEXT: sbbl %ebp, %edx ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax -; X86-NEXT: setb %al -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: sbbb $0, %bl +; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl %ebx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: sbbl %esi, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: sbbl %edx, %ebp ; X86-NEXT: movl $0, %ebp ; X86-NEXT: sbbl %ebp, %ebp -; X86-NEXT: movb $-1, %ah -; X86-NEXT: jb .LBB18_28 -; X86-NEXT: # %bb.27: -; X86-NEXT: movb %al, %ah -; X86-NEXT: .LBB18_28: -; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: andl $127, %edx -; X86-NEXT: andl $127, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl %ebp, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %esi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: setb %cl +; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: sbbl %edi, %esi +; X86-NEXT: sbbl %ebx, %edx ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax -; X86-NEXT: setb %al -; X86-NEXT: cmpl %ebx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl %ecx, %edi -; X86-NEXT: movl $0, %edi -; X86-NEXT: sbbl %edi, %edi -; X86-NEXT: movb $-1, %cl -; X86-NEXT: jb .LBB18_30 -; X86-NEXT: # %bb.29: -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: .LBB18_30: +; X86-NEXT: sbbb $0, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: andl $127, %ebp -; X86-NEXT: andl $127, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %ecx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: sbbl %edi, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: sbbl %edx, %eax ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax -; X86-NEXT: setb %al -; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: setb %bl +; X86-NEXT: cmpl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: sbbl %ebp, %edi ; X86-NEXT: sbbl %ecx, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl $0, %ecx ; X86-NEXT: sbbl %ecx, %ecx +; X86-NEXT: sbbb $0, %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movb $-1, %cl -; X86-NEXT: jb .LBB18_32 -; X86-NEXT: # %bb.31: -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: .LBB18_32: -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: cmpl %esi, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: sbbl %edi, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: sbbl %edx, %ebp +; X86-NEXT: movl $0, %ebp +; X86-NEXT: sbbl %ebp, %ebp +; X86-NEXT: setb %bh +; X86-NEXT: cmpl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: sbbl %ecx, %edi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %ecx, %ecx +; X86-NEXT: sbbb $0, %bh +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: sbbl %edi, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: movl $0, %ebp +; X86-NEXT: sbbl %ebp, %ebp +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: sbbl %esi, %edi ; X86-NEXT: sbbl %edx, %eax -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: sbbl %ebp, %eax -; X86-NEXT: movl $0, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: sbbl %eax, %eax -; X86-NEXT: setb %al -; X86-NEXT: cmpl %ecx, %esi -; X86-NEXT: sbbl %edi, %edx -; X86-NEXT: sbbl %ebx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: sbbl %edx, %edx -; X86-NEXT: movb $-1, %dl -; X86-NEXT: jb .LBB18_34 -; X86-NEXT: # %bb.33: -; X86-NEXT: movl %eax, %edx -; X86-NEXT: .LBB18_34: -; X86-NEXT: movzbl %dl, %eax -; X86-NEXT: andl $3, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movb %al, 4(%edx) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X86-NEXT: sbbb $0, %cl +; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $3, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movb %cl, 4(%edi) +; X86-NEXT: movzbl %bh, %ebp +; X86-NEXT: movzbl %bl, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 1-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload +; X86-NEXT: andl $3, %ebp ; X86-NEXT: andl $3, %ecx -; X86-NEXT: andl $3, %edi -; X86-NEXT: leal (%edi,%ecx,4), %eax -; X86-NEXT: andl $3, %esi -; X86-NEXT: shll $4, %esi -; X86-NEXT: orl %eax, %esi +; X86-NEXT: leal (%ecx,%ebp,4), %ecx +; X86-NEXT: andl $3, %eax +; X86-NEXT: shll $4, %eax +; X86-NEXT: orl %ecx, %eax ; X86-NEXT: andl $3, %ebx ; X86-NEXT: shll $6, %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: andl $3, %ebp -; X86-NEXT: shll $8, %ebp -; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: orl %eax, %ebx +; X86-NEXT: andl $3, %esi +; X86-NEXT: shll $8, %esi +; X86-NEXT: orl %ebx, %esi ; X86-NEXT: andl $3, %edx ; X86-NEXT: shll $10, %edx -; X86-NEXT: orl %ebp, %edx +; X86-NEXT: orl %esi, %edx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: andl $3, %eax ; X86-NEXT: shll $12, %eax @@ -2457,37 +2113,37 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; X86-NEXT: andl $3, %eax ; X86-NEXT: shll $16, %eax ; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload -; X86-NEXT: andl $3, %edi -; X86-NEXT: shll $18, %edi -; X86-NEXT: orl %eax, %edi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X86-NEXT: andl $3, %esi +; X86-NEXT: shll $18, %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: andl $3, %eax +; X86-NEXT: shll $20, %eax +; X86-NEXT: orl %esi, %eax ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X86-NEXT: andl $3, %ecx -; X86-NEXT: shll $20, %ecx -; X86-NEXT: orl %edi, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; X86-NEXT: orl %edx, %ecx +; X86-NEXT: orl %edx, %eax +; X86-NEXT: andl $3, %ecx +; X86-NEXT: shll $22, %ecx ; X86-NEXT: andl $3, %esi -; X86-NEXT: shll $22, %esi -; X86-NEXT: andl $3, %edi -; X86-NEXT: shll $24, %edi -; X86-NEXT: orl %esi, %edi +; X86-NEXT: shll $24, %esi +; X86-NEXT: orl %ecx, %esi ; X86-NEXT: andl $3, %ebx ; X86-NEXT: shll $26, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: andl $3, %eax -; X86-NEXT: shll $28, %eax -; X86-NEXT: orl %ebx, %eax +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: andl $3, %ecx +; X86-NEXT: shll $28, %ecx +; X86-NEXT: orl %ebx, %ecx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload ; X86-NEXT: shll $30, %edx -; X86-NEXT: orl %eax, %edx ; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $44, %esp +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, (%edi) +; X86-NEXT: movl %edi, %eax +; X86-NEXT: addl $132, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/MC/AMDGPU/expressions-gfx10.s b/llvm/test/MC/AMDGPU/expressions-gfx10.s index fc029a3d8e5191..efc8dc9479884b 100644 --- a/llvm/test/MC/AMDGPU/expressions-gfx10.s +++ b/llvm/test/MC/AMDGPU/expressions-gfx10.s @@ -1,5 +1,5 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck %s --check-prefix=GFX10 -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck -check-prefix=NOGFX10 --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck %s --check-prefix=GFX10 +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck -check-prefix=NOGFX10 --implicit-check-not=error: %s i1=1 diff --git a/llvm/test/MC/AMDGPU/gfx1013.s b/llvm/test/MC/AMDGPU/gfx1013.s index a4180c3d93a23e..f8bad215fdc5af 100644 --- a/llvm/test/MC/AMDGPU/gfx1013.s +++ b/llvm/test/MC/AMDGPU/gfx1013.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1013 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1013 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck %s image_bvh64_intersect_ray v[5:8], v[1:12], s[8:11] // CHECK: [0x01,0x9f,0x9c,0xf1,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1030_unsupported.s b/llvm/test/MC/AMDGPU/gfx1030_unsupported.s index 9112a30b0b7bd0..da1fb512fec988 100644 --- a/llvm/test/MC/AMDGPU/gfx1030_unsupported.s +++ b/llvm/test/MC/AMDGPU/gfx1030_unsupported.s @@ -1,5 +1,5 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s v_mul_lo_i32 v0, v1, v2 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_dpp16.s b/llvm/test/MC/AMDGPU/gfx10_asm_dpp16.s index 77df1f62b22432..d8987be832d4c6 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_dpp16.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX10,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX10,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX10,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX10,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_mov_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0 // GFX10: [0xfa,0x02,0x0a,0x7e,0x01,0x1b,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_dpp8.s b/llvm/test/MC/AMDGPU/gfx10_asm_dpp8.s index 260d06d2e4cb58..944e2b3526464c 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_dpp8.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX10,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX10,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX10,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX10,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_mov_b32_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] // GFX10: encoding: [0xe9,0x02,0x0a,0x7e,0x01,0x88,0xc6,0xfa] diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_ds.s b/llvm/test/MC/AMDGPU/gfx10_asm_ds.s index abe3586e3d0ea7..ccdcd2262f9a40 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_ds.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_ds.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX10 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s //===----------------------------------------------------------------------===// // ENC_DS. diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_err.s b/llvm/test/MC/AMDGPU/gfx10_asm_err.s index ef12ba2a66b194..eee26196623d6e 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_err.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_err.s @@ -2,8 +2,8 @@ // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx701 %s 2>&1 | FileCheck --check-prefixes=GFX6-7,GFX6-8,GFX6-9 --implicit-check-not=error: %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx801 %s 2>&1 | FileCheck --check-prefixes=GFX6-8,GFX6-9,GFX8-9 --implicit-check-not=error: %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx900 %s 2>&1 | FileCheck --check-prefixes=GFX6-9,GFX8-9 --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=GFX10 --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=GFX10 --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefixes=GFX10 --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=GFX10 --implicit-check-not=error: %s //===----------------------------------------------------------------------===// // ENC_DS. diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_flat.s b/llvm/test/MC/AMDGPU/gfx10_asm_flat.s index 7340e48db3e84b..4a375c224b1228 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_flat.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_flat.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX10 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s //===----------------------------------------------------------------------===// // ENC_FLAT. diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_mubuf.s b/llvm/test/MC/AMDGPU/gfx10_asm_mubuf.s index b77f8e0a319270..95c22ef8ce283e 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_mubuf.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_mubuf.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX10 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s //===----------------------------------------------------------------------===// // ENC_MUBUF. diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_smem.s b/llvm/test/MC/AMDGPU/gfx10_asm_smem.s index 25c9e8f35093e2..b582de83a2f291 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_smem.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_smem.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX10 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s //===----------------------------------------------------------------------===// // ENC_SMEM. diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_sop.s b/llvm/test/MC/AMDGPU/gfx10_asm_sop.s index 8f1cde76c6aa3c..c35b04c20c8c37 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_sop.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_sop.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX10 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s //===----------------------------------------------------------------------===// // ENC_SOP1. diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop1.s index 5a1673ef1ccef3..3cc25501ff7c03 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop1.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=GFX10-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=GFX10-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX10 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=GFX10-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=GFX10-ERR --implicit-check-not=error: %s //===----------------------------------------------------------------------===// // ENC_VOP1. diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s index bf8e18ec145123..3dcf288bbbaa53 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX10,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX10,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX10,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX10,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s //===----------------------------------------------------------------------===// // ENC_VOP2. diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s index b4256f1a628d5c..c151bf99b76c50 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX10,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX10,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=GFX10-ERR,W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=GFX10-ERR,W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX10,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX10,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefixes=GFX10-ERR,W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=GFX10-ERR,W64-ERR --implicit-check-not=error: %s //===----------------------------------------------------------------------===// // ENC_VOP3. diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vopc.s b/llvm/test/MC/AMDGPU/gfx10_asm_vopc.s index 40618d1ea52355..8f42a478349616 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_vopc.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_vopc.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s //===----------------------------------------------------------------------===// // ENC_VOPC - v_cmp_* opcodes. diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vopc_e64.s b/llvm/test/MC/AMDGPU/gfx10_asm_vopc_e64.s index 669e79cdab10a6..f182de8fea984b 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_vopc_e64.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_vopc_e64.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s //===----------------------------------------------------------------------===// // ENC_VOPC, VOP3 variant. diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vopc_sdwa.s b/llvm/test/MC/AMDGPU/gfx10_asm_vopc_sdwa.s index 8808ab9ff2f9c8..50bd8c70ea5c01 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_vopc_sdwa.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_vopc_sdwa.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s //===----------------------------------------------------------------------===// // ENC_VOPC, SDWA variant. diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vopcx.s b/llvm/test/MC/AMDGPU/gfx10_asm_vopcx.s index eed284e56bbe40..e323603e3308ce 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_vopcx.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX10 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s //===----------------------------------------------------------------------===// // ENC_VOPC - v_cmpx_* opcodes. diff --git a/llvm/test/MC/AMDGPU/gfx10_unsupported.s b/llvm/test/MC/AMDGPU/gfx10_unsupported.s index 341ae5726c0efa..46b4e6ffb40376 100644 --- a/llvm/test/MC/AMDGPU/gfx10_unsupported.s +++ b/llvm/test/MC/AMDGPU/gfx10_unsupported.s @@ -1,6 +1,6 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=CHECK,GFX1010 --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=CHECK,GFX1010 --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1013 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefixes=CHECK,GFX1010 --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=CHECK,GFX1010 --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1013 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s buffer_atomic_add_f32 v0, v2, s[4:7], 0 idxen glc // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx10_unsupported_dpp.s b/llvm/test/MC/AMDGPU/gfx10_unsupported_dpp.s index 35c70fedb4661f..aa8f36d0da64ac 100644 --- a/llvm/test/MC/AMDGPU/gfx10_unsupported_dpp.s +++ b/llvm/test/MC/AMDGPU/gfx10_unsupported_dpp.s @@ -1,6 +1,6 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1013 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1013 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s v_add_co_u32_dpp v255, vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: dpp variant of this instruction is not supported diff --git a/llvm/test/MC/AMDGPU/gfx10_unsupported_e32.s b/llvm/test/MC/AMDGPU/gfx10_unsupported_e32.s index cdcc6644824322..4d0c0a4a21b18c 100644 --- a/llvm/test/MC/AMDGPU/gfx10_unsupported_e32.s +++ b/llvm/test/MC/AMDGPU/gfx10_unsupported_e32.s @@ -1,6 +1,6 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1013 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1013 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s v_add_co_u32_e32 v2, vcc, s0, v2 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: e32 variant of this instruction is not supported diff --git a/llvm/test/MC/AMDGPU/gfx10_unsupported_e64.s b/llvm/test/MC/AMDGPU/gfx10_unsupported_e64.s index 994a4c1b5f04ea..1b4f978c64971b 100644 --- a/llvm/test/MC/AMDGPU/gfx10_unsupported_e64.s +++ b/llvm/test/MC/AMDGPU/gfx10_unsupported_e64.s @@ -1,6 +1,6 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1013 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1013 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s v_swap_b32_e64 v1, v2 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: e64 variant of this instruction is not supported diff --git a/llvm/test/MC/AMDGPU/gfx10_unsupported_e64_dpp.s b/llvm/test/MC/AMDGPU/gfx10_unsupported_e64_dpp.s index bfc8b9a64845c2..f60473c761fd40 100644 --- a/llvm/test/MC/AMDGPU/gfx10_unsupported_e64_dpp.s +++ b/llvm/test/MC/AMDGPU/gfx10_unsupported_e64_dpp.s @@ -1,6 +1,6 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1013 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1013 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s v_add3_u32_e64_dpp v5, v1, s1, v0 dpp8:[7,6,5,4,3,2,1,0] // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: e64_dpp variant of this instruction is not supported diff --git a/llvm/test/MC/AMDGPU/gfx10_unsupported_sdwa.s b/llvm/test/MC/AMDGPU/gfx10_unsupported_sdwa.s index b28c652335218e..88db110ad9c20a 100644 --- a/llvm/test/MC/AMDGPU/gfx10_unsupported_sdwa.s +++ b/llvm/test/MC/AMDGPU/gfx10_unsupported_sdwa.s @@ -1,6 +1,6 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1013 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1013 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s v_add_co_u32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: sdwa variant of this instruction is not supported diff --git a/llvm/test/MC/AMDGPU/gfx11-promotions.s b/llvm/test/MC/AMDGPU/gfx11-promotions.s index 67e7beaa262f47..c92fb45c79d026 100644 --- a/llvm/test/MC/AMDGPU/gfx11-promotions.s +++ b/llvm/test/MC/AMDGPU/gfx11-promotions.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s | FileCheck --check-prefix=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1100 -mattr=+wavefrontsize32 %s | FileCheck --check-prefix=GFX11 %s // Check opcode promotions and forced suffices. // 1. When a suffix is optional, check that it may be omitted. diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_flat.s b/llvm/test/MC/AMDGPU/gfx11_asm_flat.s index 17deec42bc5fba..6add0c12d0944f 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_flat.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_flat.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s //===----------------------------------------------------------------------===// // FLAT. diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1-fake16.s index 28c38c99fd6799..c590d42c0dc663 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1-fake16.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s v_floor_f16 v5, v1 // GFX11: encoding: [0x01,0xb7,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index 993bfedb1b644f..d95ef6f15e48d1 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s v_bfrev_b32_e32 v5, v1 // GFX11: encoding: [0x01,0x71,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16-fake16.s index 10ea213a388c58..6cf3900dd3f0d9 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16-fake16.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s v_floor_f16 v5, v1 quad_perm:[3,2,1,0] // GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s index 9342c57348c7fc..fea36e9f221445 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8-fake16.s index b49848f67dfee2..2b06f4597fd2fb 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8-fake16.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s v_floor_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: encoding: [0xe9,0xb6,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s index 3aa926500f94fc..fc9079fc54282a 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: encoding: [0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err-fake16.s index 826dd2a6109264..2cc67c23809b95 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err-fake16.s @@ -1,5 +1,5 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s v_floor_f16_e32 v128, 0xfe0b // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s index 7efaea0d92f466..5b5381b752febb 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s @@ -1,5 +1,5 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s v_ceil_f16_e32 v128.l, 0xfe0b // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s index 601551efe00c2b..68de95a9857e81 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s v_ceil_f16 v128, 0xfe0b // GFX11: v_ceil_f16_e64 diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2.s index 2f71eaebb1e4f0..fb300b2e94972d 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_add_co_ci_u32_e32 v5, vcc_lo, v1, v2, vcc_lo // W32: encoding: [0x01,0x05,0x0a,0x40] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp16.s index af090a270fbeb8..62c0deaecd96a5 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp16.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] // W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp8.s index 29dd341873f552..d235fcdeb526aa 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp8.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] // W32: encoding: [0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s index 12697dfe259fde..7c50b4c22fc294 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s @@ -1,5 +1,5 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s v_add_f16_e32 v255, v1, v2 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s index 9c67b6499dd58b..a5b5f32e976226 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s v_add_f16 v255, v1, v2 // GFX11: v_add_f16_e64 diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s index c0ae7ecbdbdd86..e025ab73933eb6 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_add3_u32 v5, v1, v2, s3 // GFX11: encoding: [0x05,0x00,0x55,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s index a8cc90ef6f8b58..ad1d652a2ac1df 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck -check-prefix=GFX11 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck -check-prefix=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck -check-prefix=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck -check-prefix=GFX11 %s v_cvt_pknorm_i16_f16 v5, v1, v2 // GFX11: v_cvt_pk_norm_i16_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x12,0xd7,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s index 147d6c5d0789c9..58fec38cf57fbe 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=GFX11-ERR,W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=GFX11-ERR,W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefixes=GFX11-ERR,W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=GFX11-ERR,W64-ERR --implicit-check-not=error: %s v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] // GFX11: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s index 1217a833ee38af..7d6ba3f9c4bcce 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s index 0314a2be79cc91..d105ac6a72bb64 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s v_bfrev_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX11: [0x05,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop2.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop2.s index 3592679831d43b..f91faffe5655d2 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop2.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop2.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefixes=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=W64-ERR --implicit-check-not=error: %s v_add_co_ci_u32_e64_dpp v5, s6, v1, v2, s3 quad_perm:[3,2,1,0] // W32: [0x05,0x06,0x20,0xd5,0xfa,0x04,0x0e,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopc.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopc.s index 9a9a903085dd78..0473a86ffeb1f2 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopc.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopc.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_cmp_class_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0] // W32: [0x05,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s index 81ec15bb48f863..718d22469c580f 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s index 4c00148f7a8959..2fb95663a2f854 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=GFX11-ERR,W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=GFX11-ERR,W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefixes=GFX11-ERR,W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=GFX11-ERR,W64-ERR --implicit-check-not=error: %s v_add3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1-fake16.s index a3a07c9c5b6d42..665c99d697e00a 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1-fake16.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX11 %s v_ceil_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0xdc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s index c91538168a320d..c7faf1124755e0 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX11 %s v_bfrev_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop2.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop2.s index 2ae47cf36b62df..69f6e795e1c386 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop2.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop2.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefixes=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=W64-ERR --implicit-check-not=error: %s v_add_co_ci_u32_e64_dpp v5, s6, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] // W32: [0x05,0x06,0x20,0xd5,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopc.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopc.s index 83ae41d81df691..c99936903a4760 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopc.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopc.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_cmp_class_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // W32: [0x05,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s index 8c26c769a1962e..610261ad2a3034 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_err.s index 9fc17a6a27147b..1730f18dc9df50 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_err.s @@ -1,5 +1,5 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s v_permlane16_b32 v5, v1, s2, s3 op_sel:[0, 0, 0, 1] // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid op_sel operand diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s index 71e0f82504151c..e6744a5082f2c7 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX11 %s v_bfrev_b32_e64 v5, v1 // GFX11: encoding: [0x05,0x00,0xb8,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop2.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop2.s index 242c8a79fdd6f5..f6907520fbb030 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop2.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop2.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_add_co_ci_u32_e64 v5, s6, v1, 0xaf123456, s3 // W32: encoding: [0x05,0x06,0x20,0xd5,0x01,0xff,0x0d,0x00,0x56,0x34,0x12,0xaf] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopc.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopc.s index 72894715408ad0..d0e79c0aa3444b 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopc.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopc.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_cmp_class_f16_e64 s5, v1, v2 // W32: encoding: [0x05,0x00,0x7d,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s index 79409a568475a0..f8b65857a46f1e 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s v_cmpx_class_f16_e64 v1, v2 // GFX11: encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp16.s index 3ff4ed27f1b256..10e9cbf3bc91c5 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp16.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0] neg_hi:[0,0,0] quad_perm:[2,2,3,1] bound_ctrl:0 fi:1 // GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x04,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp8.s index 3fb993dc8bec48..34b692d2956642 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp8.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s v_fma_mix_f32 v0, v1, v2, v3 dpp8:[2,2,2,2,4,4,4,4] // GFX11: encoding: [0x00,0x00,0x20,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x92,0x44,0x92] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopc.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopc.s index f2025e4bd6d60a..5349362b8fbaf4 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopc.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopc.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_cmp_class_f16_e32 vcc_lo, v1, v2 // W32: encoding: [0x01,0x05,0xfa,0x7c] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopc_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopc_dpp16.s index 802562f38f44fd..1299d02c3c0a53 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopc_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopc_dpp16.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_cmp_class_f16_dpp vcc_lo, v1, v2 quad_perm:[3,2,1,0] // W32: encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopc_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopc_dpp8.s index e6ff8faa3aebc2..9f10a29791ad1c 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopc_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopc_dpp8.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_cmp_class_f16_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // W32: encoding: [0xe9,0x04,0xfa,0x7c,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopc_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopc_t16_promote.s index b16caed8b275f7..75f20b0c7f0c4c 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopc_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopc_t16_promote.s @@ -12,13 +12,13 @@ v_cmp_class_f16 vcc, vcc_hi, v255 v_cmp_class_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_class_f16_e64 -v_cmp_class_f16 vcc_lo, v127, v255 +v_cmp_class_f16 vcc, v127, v255 // GFX11: v_cmp_class_f16_e64 -v_cmp_class_f16 vcc_lo, vcc_hi, v255 +v_cmp_class_f16 vcc, vcc_hi, v255 // GFX11: v_cmp_class_f16_e64 -v_cmp_class_f16 vcc_lo, vcc_lo, v255 +v_cmp_class_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_class_f16_e64 v_cmp_eq_f16 vcc, v1, v255 @@ -33,16 +33,16 @@ v_cmp_eq_f16 vcc, vcc_hi, v255 v_cmp_eq_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_eq_f16_e64 -v_cmp_eq_f16 vcc_lo, v1, v255 +v_cmp_eq_f16 vcc, v1, v255 // GFX11: v_cmp_eq_f16_e64 -v_cmp_eq_f16 vcc_lo, v127, v255 +v_cmp_eq_f16 vcc, v127, v255 // GFX11: v_cmp_eq_f16_e64 -v_cmp_eq_f16 vcc_lo, vcc_hi, v255 +v_cmp_eq_f16 vcc, vcc_hi, v255 // GFX11: v_cmp_eq_f16_e64 -v_cmp_eq_f16 vcc_lo, vcc_lo, v255 +v_cmp_eq_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_eq_f16_e64 v_cmp_eq_i16 vcc, v1, v255 @@ -57,16 +57,16 @@ v_cmp_eq_i16 vcc, vcc_hi, v255 v_cmp_eq_i16 vcc, vcc_lo, v255 // GFX11: v_cmp_eq_i16_e64 -v_cmp_eq_i16 vcc_lo, v1, v255 +v_cmp_eq_i16 vcc, v1, v255 // GFX11: v_cmp_eq_i16_e64 -v_cmp_eq_i16 vcc_lo, v127, v255 +v_cmp_eq_i16 vcc, v127, v255 // GFX11: v_cmp_eq_i16_e64 -v_cmp_eq_i16 vcc_lo, vcc_hi, v255 +v_cmp_eq_i16 vcc, vcc_hi, v255 // GFX11: v_cmp_eq_i16_e64 -v_cmp_eq_i16 vcc_lo, vcc_lo, v255 +v_cmp_eq_i16 vcc, vcc_lo, v255 // GFX11: v_cmp_eq_i16_e64 v_cmp_eq_u16 vcc, v1, v255 @@ -81,16 +81,16 @@ v_cmp_eq_u16 vcc, vcc_hi, v255 v_cmp_eq_u16 vcc, vcc_lo, v255 // GFX11: v_cmp_eq_u16_e64 -v_cmp_eq_u16 vcc_lo, v1, v255 +v_cmp_eq_u16 vcc, v1, v255 // GFX11: v_cmp_eq_u16_e64 -v_cmp_eq_u16 vcc_lo, v127, v255 +v_cmp_eq_u16 vcc, v127, v255 // GFX11: v_cmp_eq_u16_e64 -v_cmp_eq_u16 vcc_lo, vcc_hi, v255 +v_cmp_eq_u16 vcc, vcc_hi, v255 // GFX11: v_cmp_eq_u16_e64 -v_cmp_eq_u16 vcc_lo, vcc_lo, v255 +v_cmp_eq_u16 vcc, vcc_lo, v255 // GFX11: v_cmp_eq_u16_e64 v_cmp_f_f16 vcc, v1, v255 @@ -105,16 +105,16 @@ v_cmp_f_f16 vcc, vcc_hi, v255 v_cmp_f_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_f_f16_e64 -v_cmp_f_f16 vcc_lo, v1, v255 +v_cmp_f_f16 vcc, v1, v255 // GFX11: v_cmp_f_f16_e64 -v_cmp_f_f16 vcc_lo, v127, v255 +v_cmp_f_f16 vcc, v127, v255 // GFX11: v_cmp_f_f16_e64 -v_cmp_f_f16 vcc_lo, vcc_hi, v255 +v_cmp_f_f16 vcc, vcc_hi, v255 // GFX11: v_cmp_f_f16_e64 -v_cmp_f_f16 vcc_lo, vcc_lo, v255 +v_cmp_f_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_f_f16_e64 v_cmp_ge_f16 vcc, v1, v255 @@ -129,16 +129,16 @@ v_cmp_ge_f16 vcc, vcc_hi, v255 v_cmp_ge_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_ge_f16_e64 -v_cmp_ge_f16 vcc_lo, v1, v255 +v_cmp_ge_f16 vcc, v1, v255 // GFX11: v_cmp_ge_f16_e64 -v_cmp_ge_f16 vcc_lo, v127, v255 +v_cmp_ge_f16 vcc, v127, v255 // GFX11: v_cmp_ge_f16_e64 -v_cmp_ge_f16 vcc_lo, vcc_hi, v255 +v_cmp_ge_f16 vcc, vcc_hi, v255 // GFX11: v_cmp_ge_f16_e64 -v_cmp_ge_f16 vcc_lo, vcc_lo, v255 +v_cmp_ge_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_ge_f16_e64 v_cmp_ge_i16 vcc, v1, v255 @@ -153,16 +153,16 @@ v_cmp_ge_i16 vcc, vcc_hi, v255 v_cmp_ge_i16 vcc, vcc_lo, v255 // GFX11: v_cmp_ge_i16_e64 -v_cmp_ge_i16 vcc_lo, v1, v255 +v_cmp_ge_i16 vcc, v1, v255 // GFX11: v_cmp_ge_i16_e64 -v_cmp_ge_i16 vcc_lo, v127, v255 +v_cmp_ge_i16 vcc, v127, v255 // GFX11: v_cmp_ge_i16_e64 -v_cmp_ge_i16 vcc_lo, vcc_hi, v255 +v_cmp_ge_i16 vcc, vcc_hi, v255 // GFX11: v_cmp_ge_i16_e64 -v_cmp_ge_i16 vcc_lo, vcc_lo, v255 +v_cmp_ge_i16 vcc, vcc_lo, v255 // GFX11: v_cmp_ge_i16_e64 v_cmp_ge_u16 vcc, v1, v255 @@ -177,16 +177,16 @@ v_cmp_ge_u16 vcc, vcc_hi, v255 v_cmp_ge_u16 vcc, vcc_lo, v255 // GFX11: v_cmp_ge_u16_e64 -v_cmp_ge_u16 vcc_lo, v1, v255 +v_cmp_ge_u16 vcc, v1, v255 // GFX11: v_cmp_ge_u16_e64 -v_cmp_ge_u16 vcc_lo, v127, v255 +v_cmp_ge_u16 vcc, v127, v255 // GFX11: v_cmp_ge_u16_e64 -v_cmp_ge_u16 vcc_lo, vcc_hi, v255 +v_cmp_ge_u16 vcc, vcc_hi, v255 // GFX11: v_cmp_ge_u16_e64 -v_cmp_ge_u16 vcc_lo, vcc_lo, v255 +v_cmp_ge_u16 vcc, vcc_lo, v255 // GFX11: v_cmp_ge_u16_e64 v_cmp_gt_f16 vcc, v1, v255 @@ -201,16 +201,16 @@ v_cmp_gt_f16 vcc, vcc_hi, v255 v_cmp_gt_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_gt_f16_e64 -v_cmp_gt_f16 vcc_lo, v1, v255 +v_cmp_gt_f16 vcc, v1, v255 // GFX11: v_cmp_gt_f16_e64 -v_cmp_gt_f16 vcc_lo, v127, v255 +v_cmp_gt_f16 vcc, v127, v255 // GFX11: v_cmp_gt_f16_e64 -v_cmp_gt_f16 vcc_lo, vcc_hi, v255 +v_cmp_gt_f16 vcc, vcc_hi, v255 // GFX11: v_cmp_gt_f16_e64 -v_cmp_gt_f16 vcc_lo, vcc_lo, v255 +v_cmp_gt_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_gt_f16_e64 v_cmp_gt_i16 vcc, v1, v255 @@ -225,16 +225,16 @@ v_cmp_gt_i16 vcc, vcc_hi, v255 v_cmp_gt_i16 vcc, vcc_lo, v255 // GFX11: v_cmp_gt_i16_e64 -v_cmp_gt_i16 vcc_lo, v1, v255 +v_cmp_gt_i16 vcc, v1, v255 // GFX11: v_cmp_gt_i16_e64 -v_cmp_gt_i16 vcc_lo, v127, v255 +v_cmp_gt_i16 vcc, v127, v255 // GFX11: v_cmp_gt_i16_e64 -v_cmp_gt_i16 vcc_lo, vcc_hi, v255 +v_cmp_gt_i16 vcc, vcc_hi, v255 // GFX11: v_cmp_gt_i16_e64 -v_cmp_gt_i16 vcc_lo, vcc_lo, v255 +v_cmp_gt_i16 vcc, vcc_lo, v255 // GFX11: v_cmp_gt_i16_e64 v_cmp_gt_u16 vcc, v1, v255 @@ -249,16 +249,16 @@ v_cmp_gt_u16 vcc, vcc_hi, v255 v_cmp_gt_u16 vcc, vcc_lo, v255 // GFX11: v_cmp_gt_u16_e64 -v_cmp_gt_u16 vcc_lo, v1, v255 +v_cmp_gt_u16 vcc, v1, v255 // GFX11: v_cmp_gt_u16_e64 -v_cmp_gt_u16 vcc_lo, v127, v255 +v_cmp_gt_u16 vcc, v127, v255 // GFX11: v_cmp_gt_u16_e64 -v_cmp_gt_u16 vcc_lo, vcc_hi, v255 +v_cmp_gt_u16 vcc, vcc_hi, v255 // GFX11: v_cmp_gt_u16_e64 -v_cmp_gt_u16 vcc_lo, vcc_lo, v255 +v_cmp_gt_u16 vcc, vcc_lo, v255 // GFX11: v_cmp_gt_u16_e64 v_cmp_le_f16 vcc, v1, v255 @@ -273,16 +273,16 @@ v_cmp_le_f16 vcc, vcc_hi, v255 v_cmp_le_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_le_f16_e64 -v_cmp_le_f16 vcc_lo, v1, v255 +v_cmp_le_f16 vcc, v1, v255 // GFX11: v_cmp_le_f16_e64 -v_cmp_le_f16 vcc_lo, v127, v255 +v_cmp_le_f16 vcc, v127, v255 // GFX11: v_cmp_le_f16_e64 -v_cmp_le_f16 vcc_lo, vcc_hi, v255 +v_cmp_le_f16 vcc, vcc_hi, v255 // GFX11: v_cmp_le_f16_e64 -v_cmp_le_f16 vcc_lo, vcc_lo, v255 +v_cmp_le_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_le_f16_e64 v_cmp_le_i16 vcc, v1, v255 @@ -297,16 +297,16 @@ v_cmp_le_i16 vcc, vcc_hi, v255 v_cmp_le_i16 vcc, vcc_lo, v255 // GFX11: v_cmp_le_i16_e64 -v_cmp_le_i16 vcc_lo, v1, v255 +v_cmp_le_i16 vcc, v1, v255 // GFX11: v_cmp_le_i16_e64 -v_cmp_le_i16 vcc_lo, v127, v255 +v_cmp_le_i16 vcc, v127, v255 // GFX11: v_cmp_le_i16_e64 -v_cmp_le_i16 vcc_lo, vcc_hi, v255 +v_cmp_le_i16 vcc, vcc_hi, v255 // GFX11: v_cmp_le_i16_e64 -v_cmp_le_i16 vcc_lo, vcc_lo, v255 +v_cmp_le_i16 vcc, vcc_lo, v255 // GFX11: v_cmp_le_i16_e64 v_cmp_le_u16 vcc, v1, v255 @@ -321,16 +321,16 @@ v_cmp_le_u16 vcc, vcc_hi, v255 v_cmp_le_u16 vcc, vcc_lo, v255 // GFX11: v_cmp_le_u16_e64 -v_cmp_le_u16 vcc_lo, v1, v255 +v_cmp_le_u16 vcc, v1, v255 // GFX11: v_cmp_le_u16_e64 -v_cmp_le_u16 vcc_lo, v127, v255 +v_cmp_le_u16 vcc, v127, v255 // GFX11: v_cmp_le_u16_e64 -v_cmp_le_u16 vcc_lo, vcc_hi, v255 +v_cmp_le_u16 vcc, vcc_hi, v255 // GFX11: v_cmp_le_u16_e64 -v_cmp_le_u16 vcc_lo, vcc_lo, v255 +v_cmp_le_u16 vcc, vcc_lo, v255 // GFX11: v_cmp_le_u16_e64 v_cmp_lg_f16 vcc, v1, v255 @@ -345,16 +345,16 @@ v_cmp_lg_f16 vcc, vcc_hi, v255 v_cmp_lg_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_lg_f16_e64 -v_cmp_lg_f16 vcc_lo, v1, v255 +v_cmp_lg_f16 vcc, v1, v255 // GFX11: v_cmp_lg_f16_e64 -v_cmp_lg_f16 vcc_lo, v127, v255 +v_cmp_lg_f16 vcc, v127, v255 // GFX11: v_cmp_lg_f16_e64 -v_cmp_lg_f16 vcc_lo, vcc_hi, v255 +v_cmp_lg_f16 vcc, vcc_hi, v255 // GFX11: v_cmp_lg_f16_e64 -v_cmp_lg_f16 vcc_lo, vcc_lo, v255 +v_cmp_lg_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_lg_f16_e64 v_cmp_lt_f16 vcc, v1, v255 @@ -369,16 +369,16 @@ v_cmp_lt_f16 vcc, vcc_hi, v255 v_cmp_lt_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_lt_f16_e64 -v_cmp_lt_f16 vcc_lo, v1, v255 +v_cmp_lt_f16 vcc, v1, v255 // GFX11: v_cmp_lt_f16_e64 -v_cmp_lt_f16 vcc_lo, v127, v255 +v_cmp_lt_f16 vcc, v127, v255 // GFX11: v_cmp_lt_f16_e64 -v_cmp_lt_f16 vcc_lo, vcc_hi, v255 +v_cmp_lt_f16 vcc, vcc_hi, v255 // GFX11: v_cmp_lt_f16_e64 -v_cmp_lt_f16 vcc_lo, vcc_lo, v255 +v_cmp_lt_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_lt_f16_e64 v_cmp_lt_i16 vcc, v1, v255 @@ -393,16 +393,16 @@ v_cmp_lt_i16 vcc, vcc_hi, v255 v_cmp_lt_i16 vcc, vcc_lo, v255 // GFX11: v_cmp_lt_i16_e64 -v_cmp_lt_i16 vcc_lo, v1, v255 +v_cmp_lt_i16 vcc, v1, v255 // GFX11: v_cmp_lt_i16_e64 -v_cmp_lt_i16 vcc_lo, v127, v255 +v_cmp_lt_i16 vcc, v127, v255 // GFX11: v_cmp_lt_i16_e64 -v_cmp_lt_i16 vcc_lo, vcc_hi, v255 +v_cmp_lt_i16 vcc, vcc_hi, v255 // GFX11: v_cmp_lt_i16_e64 -v_cmp_lt_i16 vcc_lo, vcc_lo, v255 +v_cmp_lt_i16 vcc, vcc_lo, v255 // GFX11: v_cmp_lt_i16_e64 v_cmp_lt_u16 vcc, v1, v255 @@ -417,16 +417,16 @@ v_cmp_lt_u16 vcc, vcc_hi, v255 v_cmp_lt_u16 vcc, vcc_lo, v255 // GFX11: v_cmp_lt_u16_e64 -v_cmp_lt_u16 vcc_lo, v1, v255 +v_cmp_lt_u16 vcc, v1, v255 // GFX11: v_cmp_lt_u16_e64 -v_cmp_lt_u16 vcc_lo, v127, v255 +v_cmp_lt_u16 vcc, v127, v255 // GFX11: v_cmp_lt_u16_e64 -v_cmp_lt_u16 vcc_lo, vcc_hi, v255 +v_cmp_lt_u16 vcc, vcc_hi, v255 // GFX11: v_cmp_lt_u16_e64 -v_cmp_lt_u16 vcc_lo, vcc_lo, v255 +v_cmp_lt_u16 vcc, vcc_lo, v255 // GFX11: v_cmp_lt_u16_e64 v_cmp_ne_i16 vcc, v1, v255 @@ -441,16 +441,16 @@ v_cmp_ne_i16 vcc, vcc_hi, v255 v_cmp_ne_i16 vcc, vcc_lo, v255 // GFX11: v_cmp_ne_i16_e64 -v_cmp_ne_i16 vcc_lo, v1, v255 +v_cmp_ne_i16 vcc, v1, v255 // GFX11: v_cmp_ne_i16_e64 -v_cmp_ne_i16 vcc_lo, v127, v255 +v_cmp_ne_i16 vcc, v127, v255 // GFX11: v_cmp_ne_i16_e64 -v_cmp_ne_i16 vcc_lo, vcc_hi, v255 +v_cmp_ne_i16 vcc, vcc_hi, v255 // GFX11: v_cmp_ne_i16_e64 -v_cmp_ne_i16 vcc_lo, vcc_lo, v255 +v_cmp_ne_i16 vcc, vcc_lo, v255 // GFX11: v_cmp_ne_i16_e64 v_cmp_ne_u16 vcc, v1, v255 @@ -465,16 +465,16 @@ v_cmp_ne_u16 vcc, vcc_hi, v255 v_cmp_ne_u16 vcc, vcc_lo, v255 // GFX11: v_cmp_ne_u16_e64 -v_cmp_ne_u16 vcc_lo, v1, v255 +v_cmp_ne_u16 vcc, v1, v255 // GFX11: v_cmp_ne_u16_e64 -v_cmp_ne_u16 vcc_lo, v127, v255 +v_cmp_ne_u16 vcc, v127, v255 // GFX11: v_cmp_ne_u16_e64 -v_cmp_ne_u16 vcc_lo, vcc_hi, v255 +v_cmp_ne_u16 vcc, vcc_hi, v255 // GFX11: v_cmp_ne_u16_e64 -v_cmp_ne_u16 vcc_lo, vcc_lo, v255 +v_cmp_ne_u16 vcc, vcc_lo, v255 // GFX11: v_cmp_ne_u16_e64 v_cmp_neq_f16 vcc, v1, v255 @@ -489,16 +489,16 @@ v_cmp_neq_f16 vcc, vcc_hi, v255 v_cmp_neq_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_neq_f16_e64 -v_cmp_neq_f16 vcc_lo, v1, v255 +v_cmp_neq_f16 vcc, v1, v255 // GFX11: v_cmp_neq_f16_e64 -v_cmp_neq_f16 vcc_lo, v127, v255 +v_cmp_neq_f16 vcc, v127, v255 // GFX11: v_cmp_neq_f16_e64 -v_cmp_neq_f16 vcc_lo, vcc_hi, v255 +v_cmp_neq_f16 vcc, vcc_hi, v255 // GFX11: v_cmp_neq_f16_e64 -v_cmp_neq_f16 vcc_lo, vcc_lo, v255 +v_cmp_neq_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_neq_f16_e64 v_cmp_nge_f16 vcc, v1, v255 @@ -513,16 +513,16 @@ v_cmp_nge_f16 vcc, vcc_hi, v255 v_cmp_nge_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_nge_f16_e64 -v_cmp_nge_f16 vcc_lo, v1, v255 +v_cmp_nge_f16 vcc, v1, v255 // GFX11: v_cmp_nge_f16_e64 -v_cmp_nge_f16 vcc_lo, v127, v255 +v_cmp_nge_f16 vcc, v127, v255 // GFX11: v_cmp_nge_f16_e64 -v_cmp_nge_f16 vcc_lo, vcc_hi, v255 +v_cmp_nge_f16 vcc, vcc_hi, v255 // GFX11: v_cmp_nge_f16_e64 -v_cmp_nge_f16 vcc_lo, vcc_lo, v255 +v_cmp_nge_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_nge_f16_e64 v_cmp_ngt_f16 vcc, v1, v255 @@ -537,16 +537,16 @@ v_cmp_ngt_f16 vcc, vcc_hi, v255 v_cmp_ngt_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_ngt_f16_e64 -v_cmp_ngt_f16 vcc_lo, v1, v255 +v_cmp_ngt_f16 vcc, v1, v255 // GFX11: v_cmp_ngt_f16_e64 -v_cmp_ngt_f16 vcc_lo, v127, v255 +v_cmp_ngt_f16 vcc, v127, v255 // GFX11: v_cmp_ngt_f16_e64 -v_cmp_ngt_f16 vcc_lo, vcc_hi, v255 +v_cmp_ngt_f16 vcc, vcc_hi, v255 // GFX11: v_cmp_ngt_f16_e64 -v_cmp_ngt_f16 vcc_lo, vcc_lo, v255 +v_cmp_ngt_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_ngt_f16_e64 v_cmp_nle_f16 vcc, v1, v255 @@ -561,16 +561,16 @@ v_cmp_nle_f16 vcc, vcc_hi, v255 v_cmp_nle_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_nle_f16_e64 -v_cmp_nle_f16 vcc_lo, v1, v255 +v_cmp_nle_f16 vcc, v1, v255 // GFX11: v_cmp_nle_f16_e64 -v_cmp_nle_f16 vcc_lo, v127, v255 +v_cmp_nle_f16 vcc, v127, v255 // GFX11: v_cmp_nle_f16_e64 -v_cmp_nle_f16 vcc_lo, vcc_hi, v255 +v_cmp_nle_f16 vcc, vcc_hi, v255 // GFX11: v_cmp_nle_f16_e64 -v_cmp_nle_f16 vcc_lo, vcc_lo, v255 +v_cmp_nle_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_nle_f16_e64 v_cmp_nlg_f16 vcc, v1, v255 @@ -585,16 +585,16 @@ v_cmp_nlg_f16 vcc, vcc_hi, v255 v_cmp_nlg_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_nlg_f16_e64 -v_cmp_nlg_f16 vcc_lo, v1, v255 +v_cmp_nlg_f16 vcc, v1, v255 // GFX11: v_cmp_nlg_f16_e64 -v_cmp_nlg_f16 vcc_lo, v127, v255 +v_cmp_nlg_f16 vcc, v127, v255 // GFX11: v_cmp_nlg_f16_e64 -v_cmp_nlg_f16 vcc_lo, vcc_hi, v255 +v_cmp_nlg_f16 vcc, vcc_hi, v255 // GFX11: v_cmp_nlg_f16_e64 -v_cmp_nlg_f16 vcc_lo, vcc_lo, v255 +v_cmp_nlg_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_nlg_f16_e64 v_cmp_nlt_f16 vcc, v1, v255 @@ -609,16 +609,16 @@ v_cmp_nlt_f16 vcc, vcc_hi, v255 v_cmp_nlt_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_nlt_f16_e64 -v_cmp_nlt_f16 vcc_lo, v1, v255 +v_cmp_nlt_f16 vcc, v1, v255 // GFX11: v_cmp_nlt_f16_e64 -v_cmp_nlt_f16 vcc_lo, v127, v255 +v_cmp_nlt_f16 vcc, v127, v255 // GFX11: v_cmp_nlt_f16_e64 -v_cmp_nlt_f16 vcc_lo, vcc_hi, v255 +v_cmp_nlt_f16 vcc, vcc_hi, v255 // GFX11: v_cmp_nlt_f16_e64 -v_cmp_nlt_f16 vcc_lo, vcc_lo, v255 +v_cmp_nlt_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_nlt_f16_e64 v_cmp_o_f16 vcc, v1, v255 @@ -633,16 +633,16 @@ v_cmp_o_f16 vcc, vcc_hi, v255 v_cmp_o_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_o_f16_e64 -v_cmp_o_f16 vcc_lo, v1, v255 +v_cmp_o_f16 vcc, v1, v255 // GFX11: v_cmp_o_f16_e64 -v_cmp_o_f16 vcc_lo, v127, v255 +v_cmp_o_f16 vcc, v127, v255 // GFX11: v_cmp_o_f16_e64 -v_cmp_o_f16 vcc_lo, vcc_hi, v255 +v_cmp_o_f16 vcc, vcc_hi, v255 // GFX11: v_cmp_o_f16_e64 -v_cmp_o_f16 vcc_lo, vcc_lo, v255 +v_cmp_o_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_o_f16_e64 v_cmp_t_f16 vcc, v1, v255 @@ -657,16 +657,16 @@ v_cmp_t_f16 vcc, vcc_hi, v255 v_cmp_t_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_t_f16_e64 -v_cmp_t_f16 vcc_lo, v1, v255 +v_cmp_t_f16 vcc, v1, v255 // GFX11: v_cmp_t_f16_e64 -v_cmp_t_f16 vcc_lo, v127, v255 +v_cmp_t_f16 vcc, v127, v255 // GFX11: v_cmp_t_f16_e64 -v_cmp_t_f16 vcc_lo, vcc_hi, v255 +v_cmp_t_f16 vcc, vcc_hi, v255 // GFX11: v_cmp_t_f16_e64 -v_cmp_t_f16 vcc_lo, vcc_lo, v255 +v_cmp_t_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_t_f16_e64 v_cmp_tru_f16 vcc, v1, v255 @@ -681,16 +681,16 @@ v_cmp_tru_f16 vcc, vcc_hi, v255 v_cmp_tru_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_t_f16_e64 -v_cmp_tru_f16 vcc_lo, v1, v255 +v_cmp_tru_f16 vcc, v1, v255 // GFX11: v_cmp_t_f16_e64 -v_cmp_tru_f16 vcc_lo, v127, v255 +v_cmp_tru_f16 vcc, v127, v255 // GFX11: v_cmp_t_f16_e64 -v_cmp_tru_f16 vcc_lo, vcc_hi, v255 +v_cmp_tru_f16 vcc, vcc_hi, v255 // GFX11: v_cmp_t_f16_e64 -v_cmp_tru_f16 vcc_lo, vcc_lo, v255 +v_cmp_tru_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_t_f16_e64 v_cmp_u_f16 vcc, v1, v255 @@ -705,196 +705,196 @@ v_cmp_u_f16 vcc, vcc_hi, v255 v_cmp_u_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_u_f16_e64 -v_cmp_u_f16 vcc_lo, v1, v255 +v_cmp_u_f16 vcc, v1, v255 // GFX11: v_cmp_u_f16_e64 -v_cmp_u_f16 vcc_lo, v127, v255 +v_cmp_u_f16 vcc, v127, v255 // GFX11: v_cmp_u_f16_e64 -v_cmp_u_f16 vcc_lo, vcc_hi, v255 +v_cmp_u_f16 vcc, vcc_hi, v255 // GFX11: v_cmp_u_f16_e64 -v_cmp_u_f16 vcc_lo, vcc_lo, v255 +v_cmp_u_f16 vcc, vcc_lo, v255 // GFX11: v_cmp_u_f16_e64 v_cmp_class_f16 vcc, v128, v2 // GFX11: v_cmp_class_f16_e64 -v_cmp_class_f16 vcc_lo, v128, v2 +v_cmp_class_f16 vcc, v128, v2 // GFX11: v_cmp_class_f16_e64 v_cmp_eq_f16 vcc, v128, v2 // GFX11: v_cmp_eq_f16_e64 -v_cmp_eq_f16 vcc_lo, v128, v2 +v_cmp_eq_f16 vcc, v128, v2 // GFX11: v_cmp_eq_f16_e64 v_cmp_eq_i16 vcc, v128, v2 // GFX11: v_cmp_eq_i16_e64 -v_cmp_eq_i16 vcc_lo, v128, v2 +v_cmp_eq_i16 vcc, v128, v2 // GFX11: v_cmp_eq_i16_e64 v_cmp_eq_u16 vcc, v128, v2 // GFX11: v_cmp_eq_u16_e64 -v_cmp_eq_u16 vcc_lo, v128, v2 +v_cmp_eq_u16 vcc, v128, v2 // GFX11: v_cmp_eq_u16_e64 v_cmp_f_f16 vcc, v128, v2 // GFX11: v_cmp_f_f16_e64 -v_cmp_f_f16 vcc_lo, v128, v2 +v_cmp_f_f16 vcc, v128, v2 // GFX11: v_cmp_f_f16_e64 v_cmp_ge_f16 vcc, v128, v2 // GFX11: v_cmp_ge_f16_e64 -v_cmp_ge_f16 vcc_lo, v128, v2 +v_cmp_ge_f16 vcc, v128, v2 // GFX11: v_cmp_ge_f16_e64 v_cmp_ge_i16 vcc, v128, v2 // GFX11: v_cmp_ge_i16_e64 -v_cmp_ge_i16 vcc_lo, v128, v2 +v_cmp_ge_i16 vcc, v128, v2 // GFX11: v_cmp_ge_i16_e64 v_cmp_ge_u16 vcc, v128, v2 // GFX11: v_cmp_ge_u16_e64 -v_cmp_ge_u16 vcc_lo, v128, v2 +v_cmp_ge_u16 vcc, v128, v2 // GFX11: v_cmp_ge_u16_e64 v_cmp_gt_f16 vcc, v128, v2 // GFX11: v_cmp_gt_f16_e64 -v_cmp_gt_f16 vcc_lo, v128, v2 +v_cmp_gt_f16 vcc, v128, v2 // GFX11: v_cmp_gt_f16_e64 v_cmp_gt_i16 vcc, v128, v2 // GFX11: v_cmp_gt_i16_e64 -v_cmp_gt_i16 vcc_lo, v128, v2 +v_cmp_gt_i16 vcc, v128, v2 // GFX11: v_cmp_gt_i16_e64 v_cmp_gt_u16 vcc, v128, v2 // GFX11: v_cmp_gt_u16_e64 -v_cmp_gt_u16 vcc_lo, v128, v2 +v_cmp_gt_u16 vcc, v128, v2 // GFX11: v_cmp_gt_u16_e64 v_cmp_le_f16 vcc, v128, v2 // GFX11: v_cmp_le_f16_e64 -v_cmp_le_f16 vcc_lo, v128, v2 +v_cmp_le_f16 vcc, v128, v2 // GFX11: v_cmp_le_f16_e64 v_cmp_le_i16 vcc, v128, v2 // GFX11: v_cmp_le_i16_e64 -v_cmp_le_i16 vcc_lo, v128, v2 +v_cmp_le_i16 vcc, v128, v2 // GFX11: v_cmp_le_i16_e64 v_cmp_le_u16 vcc, v128, v2 // GFX11: v_cmp_le_u16_e64 -v_cmp_le_u16 vcc_lo, v128, v2 +v_cmp_le_u16 vcc, v128, v2 // GFX11: v_cmp_le_u16_e64 v_cmp_lg_f16 vcc, v128, v2 // GFX11: v_cmp_lg_f16_e64 -v_cmp_lg_f16 vcc_lo, v128, v2 +v_cmp_lg_f16 vcc, v128, v2 // GFX11: v_cmp_lg_f16_e64 v_cmp_lt_f16 vcc, v128, v2 // GFX11: v_cmp_lt_f16_e64 -v_cmp_lt_f16 vcc_lo, v128, v2 +v_cmp_lt_f16 vcc, v128, v2 // GFX11: v_cmp_lt_f16_e64 v_cmp_lt_i16 vcc, v128, v2 // GFX11: v_cmp_lt_i16_e64 -v_cmp_lt_i16 vcc_lo, v128, v2 +v_cmp_lt_i16 vcc, v128, v2 // GFX11: v_cmp_lt_i16_e64 v_cmp_lt_u16 vcc, v128, v2 // GFX11: v_cmp_lt_u16_e64 -v_cmp_lt_u16 vcc_lo, v128, v2 +v_cmp_lt_u16 vcc, v128, v2 // GFX11: v_cmp_lt_u16_e64 v_cmp_ne_i16 vcc, v128, v2 // GFX11: v_cmp_ne_i16_e64 -v_cmp_ne_i16 vcc_lo, v128, v2 +v_cmp_ne_i16 vcc, v128, v2 // GFX11: v_cmp_ne_i16_e64 v_cmp_ne_u16 vcc, v128, v2 // GFX11: v_cmp_ne_u16_e64 -v_cmp_ne_u16 vcc_lo, v128, v2 +v_cmp_ne_u16 vcc, v128, v2 // GFX11: v_cmp_ne_u16_e64 v_cmp_neq_f16 vcc, v128, v2 // GFX11: v_cmp_neq_f16_e64 -v_cmp_neq_f16 vcc_lo, v128, v2 +v_cmp_neq_f16 vcc, v128, v2 // GFX11: v_cmp_neq_f16_e64 v_cmp_nge_f16 vcc, v128, v2 // GFX11: v_cmp_nge_f16_e64 -v_cmp_nge_f16 vcc_lo, v128, v2 +v_cmp_nge_f16 vcc, v128, v2 // GFX11: v_cmp_nge_f16_e64 v_cmp_ngt_f16 vcc, v128, v2 // GFX11: v_cmp_ngt_f16_e64 -v_cmp_ngt_f16 vcc_lo, v128, v2 +v_cmp_ngt_f16 vcc, v128, v2 // GFX11: v_cmp_ngt_f16_e64 v_cmp_nle_f16 vcc, v128, v2 // GFX11: v_cmp_nle_f16_e64 -v_cmp_nle_f16 vcc_lo, v128, v2 +v_cmp_nle_f16 vcc, v128, v2 // GFX11: v_cmp_nle_f16_e64 v_cmp_nlg_f16 vcc, v128, v2 // GFX11: v_cmp_nlg_f16_e64 -v_cmp_nlg_f16 vcc_lo, v128, v2 +v_cmp_nlg_f16 vcc, v128, v2 // GFX11: v_cmp_nlg_f16_e64 v_cmp_nlt_f16 vcc, v128, v2 // GFX11: v_cmp_nlt_f16_e64 -v_cmp_nlt_f16 vcc_lo, v128, v2 +v_cmp_nlt_f16 vcc, v128, v2 // GFX11: v_cmp_nlt_f16_e64 v_cmp_o_f16 vcc, v128, v2 // GFX11: v_cmp_o_f16_e64 -v_cmp_o_f16 vcc_lo, v128, v2 +v_cmp_o_f16 vcc, v128, v2 // GFX11: v_cmp_o_f16_e64 v_cmp_t_f16 vcc, v128, v2 // GFX11: v_cmp_t_f16_e64 -v_cmp_t_f16 vcc_lo, v128, v2 +v_cmp_t_f16 vcc, v128, v2 // GFX11: v_cmp_t_f16_e64 v_cmp_tru_f16 vcc, v128, v2 // GFX11: v_cmp_t_f16_e64 -v_cmp_tru_f16 vcc_lo, v128, v2 +v_cmp_tru_f16 vcc, v128, v2 // GFX11: v_cmp_t_f16_e64 v_cmp_u_f16 vcc, v128, v2 // GFX11: v_cmp_u_f16_e64 -v_cmp_u_f16 vcc_lo, v128, v2 +v_cmp_u_f16 vcc, v128, v2 // GFX11: v_cmp_u_f16_e64 v_cmp_class_f16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -903,7 +903,7 @@ v_cmp_class_f16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_class_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_class_f16_e64 -v_cmp_class_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_class_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_class_f16_e64 v_cmp_eq_f16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -912,10 +912,10 @@ v_cmp_eq_f16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_eq_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_eq_f16_e64 -v_cmp_eq_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_eq_f16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_eq_f16_e64 -v_cmp_eq_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_eq_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_eq_f16_e64 v_cmp_eq_i16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -924,10 +924,10 @@ v_cmp_eq_i16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_eq_i16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_eq_i16_e64 -v_cmp_eq_i16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_eq_i16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_eq_i16_e64 -v_cmp_eq_i16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_eq_i16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_eq_i16_e64 v_cmp_eq_u16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -936,10 +936,10 @@ v_cmp_eq_u16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_eq_u16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_eq_u16_e64 -v_cmp_eq_u16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_eq_u16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_eq_u16_e64 -v_cmp_eq_u16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_eq_u16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_eq_u16_e64 v_cmp_f_f16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -948,10 +948,10 @@ v_cmp_f_f16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_f_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_f_f16_e64 -v_cmp_f_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_f_f16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_f_f16_e64 -v_cmp_f_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_f_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_f_f16_e64 v_cmp_ge_f16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -960,10 +960,10 @@ v_cmp_ge_f16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_ge_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_ge_f16_e64 -v_cmp_ge_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_ge_f16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_ge_f16_e64 -v_cmp_ge_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_ge_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_ge_f16_e64 v_cmp_ge_i16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -972,10 +972,10 @@ v_cmp_ge_i16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_ge_i16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_ge_i16_e64 -v_cmp_ge_i16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_ge_i16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_ge_i16_e64 -v_cmp_ge_i16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_ge_i16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_ge_i16_e64 v_cmp_ge_u16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -984,10 +984,10 @@ v_cmp_ge_u16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_ge_u16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_ge_u16_e64 -v_cmp_ge_u16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_ge_u16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_ge_u16_e64 -v_cmp_ge_u16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_ge_u16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_ge_u16_e64 v_cmp_gt_f16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -996,10 +996,10 @@ v_cmp_gt_f16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_gt_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_gt_f16_e64 -v_cmp_gt_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_gt_f16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_gt_f16_e64 -v_cmp_gt_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_gt_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_gt_f16_e64 v_cmp_gt_i16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1008,10 +1008,10 @@ v_cmp_gt_i16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_gt_i16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_gt_i16_e64 -v_cmp_gt_i16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_gt_i16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_gt_i16_e64 -v_cmp_gt_i16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_gt_i16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_gt_i16_e64 v_cmp_gt_u16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1020,10 +1020,10 @@ v_cmp_gt_u16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_gt_u16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_gt_u16_e64 -v_cmp_gt_u16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_gt_u16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_gt_u16_e64 -v_cmp_gt_u16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_gt_u16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_gt_u16_e64 v_cmp_le_f16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1032,10 +1032,10 @@ v_cmp_le_f16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_le_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_le_f16_e64 -v_cmp_le_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_le_f16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_le_f16_e64 -v_cmp_le_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_le_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_le_f16_e64 v_cmp_le_i16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1044,10 +1044,10 @@ v_cmp_le_i16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_le_i16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_le_i16_e64 -v_cmp_le_i16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_le_i16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_le_i16_e64 -v_cmp_le_i16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_le_i16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_le_i16_e64 v_cmp_le_u16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1056,10 +1056,10 @@ v_cmp_le_u16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_le_u16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_le_u16_e64 -v_cmp_le_u16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_le_u16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_le_u16_e64 -v_cmp_le_u16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_le_u16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_le_u16_e64 v_cmp_lg_f16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1068,10 +1068,10 @@ v_cmp_lg_f16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_lg_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_lg_f16_e64 -v_cmp_lg_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_lg_f16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_lg_f16_e64 -v_cmp_lg_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_lg_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_lg_f16_e64 v_cmp_lt_f16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1080,10 +1080,10 @@ v_cmp_lt_f16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_lt_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_lt_f16_e64 -v_cmp_lt_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_lt_f16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_lt_f16_e64 -v_cmp_lt_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_lt_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_lt_f16_e64 v_cmp_lt_i16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1092,10 +1092,10 @@ v_cmp_lt_i16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_lt_i16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_lt_i16_e64 -v_cmp_lt_i16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_lt_i16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_lt_i16_e64 -v_cmp_lt_i16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_lt_i16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_lt_i16_e64 v_cmp_lt_u16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1104,10 +1104,10 @@ v_cmp_lt_u16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_lt_u16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_lt_u16_e64 -v_cmp_lt_u16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_lt_u16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_lt_u16_e64 -v_cmp_lt_u16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_lt_u16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_lt_u16_e64 v_cmp_ne_i16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1116,10 +1116,10 @@ v_cmp_ne_i16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_ne_i16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_ne_i16_e64 -v_cmp_ne_i16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_ne_i16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_ne_i16_e64 -v_cmp_ne_i16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_ne_i16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_ne_i16_e64 v_cmp_ne_u16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1128,10 +1128,10 @@ v_cmp_ne_u16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_ne_u16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_ne_u16_e64 -v_cmp_ne_u16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_ne_u16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_ne_u16_e64 -v_cmp_ne_u16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_ne_u16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_ne_u16_e64 v_cmp_neq_f16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1140,10 +1140,10 @@ v_cmp_neq_f16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_neq_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_neq_f16_e64 -v_cmp_neq_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_neq_f16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_neq_f16_e64 -v_cmp_neq_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_neq_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_neq_f16_e64 v_cmp_nge_f16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1152,10 +1152,10 @@ v_cmp_nge_f16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_nge_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_nge_f16_e64 -v_cmp_nge_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_nge_f16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_nge_f16_e64 -v_cmp_nge_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_nge_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_nge_f16_e64 v_cmp_ngt_f16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1164,10 +1164,10 @@ v_cmp_ngt_f16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_ngt_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_ngt_f16_e64 -v_cmp_ngt_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_ngt_f16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_ngt_f16_e64 -v_cmp_ngt_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_ngt_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_ngt_f16_e64 v_cmp_nle_f16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1176,10 +1176,10 @@ v_cmp_nle_f16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_nle_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_nle_f16_e64 -v_cmp_nle_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_nle_f16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_nle_f16_e64 -v_cmp_nle_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_nle_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_nle_f16_e64 v_cmp_nlg_f16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1188,10 +1188,10 @@ v_cmp_nlg_f16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_nlg_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_nlg_f16_e64 -v_cmp_nlg_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_nlg_f16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_nlg_f16_e64 -v_cmp_nlg_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_nlg_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_nlg_f16_e64 v_cmp_nlt_f16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1200,10 +1200,10 @@ v_cmp_nlt_f16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_nlt_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_nlt_f16_e64 -v_cmp_nlt_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_nlt_f16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_nlt_f16_e64 -v_cmp_nlt_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_nlt_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_nlt_f16_e64 v_cmp_o_f16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1212,10 +1212,10 @@ v_cmp_o_f16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_o_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_o_f16_e64 -v_cmp_o_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_o_f16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_o_f16_e64 -v_cmp_o_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_o_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_o_f16_e64 v_cmp_t_f16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1224,10 +1224,10 @@ v_cmp_t_f16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_t_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_t_f16_e64 -v_cmp_t_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_t_f16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_t_f16_e64 -v_cmp_t_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_t_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_t_f16_e64 v_cmp_tru_f16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1236,10 +1236,10 @@ v_cmp_tru_f16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_tru_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_t_f16_e64 -v_cmp_tru_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_tru_f16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_t_f16_e64 -v_cmp_tru_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_tru_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_t_f16_e64 v_cmp_u_f16 vcc, v1, v255 quad_perm:[3,2,1,0] @@ -1248,190 +1248,190 @@ v_cmp_u_f16 vcc, v1, v255 quad_perm:[3,2,1,0] v_cmp_u_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_u_f16_e64 -v_cmp_u_f16 vcc_lo, v1, v255 quad_perm:[3,2,1,0] +v_cmp_u_f16 vcc, v1, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_u_f16_e64 -v_cmp_u_f16 vcc_lo, v127, v255 quad_perm:[3,2,1,0] +v_cmp_u_f16 vcc, v127, v255 quad_perm:[3,2,1,0] // GFX11: v_cmp_u_f16_e64 v_cmp_class_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_class_f16_e64 -v_cmp_class_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_class_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_class_f16_e64 v_cmp_eq_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_eq_f16_e64 -v_cmp_eq_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_eq_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_eq_f16_e64 v_cmp_eq_i16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_eq_i16_e64 -v_cmp_eq_i16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_eq_i16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_eq_i16_e64 v_cmp_eq_u16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_eq_u16_e64 -v_cmp_eq_u16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_eq_u16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_eq_u16_e64 v_cmp_f_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_f_f16_e64 -v_cmp_f_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_f_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_f_f16_e64 v_cmp_ge_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_ge_f16_e64 -v_cmp_ge_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_ge_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_ge_f16_e64 v_cmp_ge_i16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_ge_i16_e64 -v_cmp_ge_i16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_ge_i16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_ge_i16_e64 v_cmp_ge_u16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_ge_u16_e64 -v_cmp_ge_u16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_ge_u16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_ge_u16_e64 v_cmp_gt_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_gt_f16_e64 -v_cmp_gt_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_gt_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_gt_f16_e64 v_cmp_gt_i16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_gt_i16_e64 -v_cmp_gt_i16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_gt_i16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_gt_i16_e64 v_cmp_gt_u16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_gt_u16_e64 -v_cmp_gt_u16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_gt_u16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_gt_u16_e64 v_cmp_le_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_le_f16_e64 -v_cmp_le_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_le_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_le_f16_e64 v_cmp_le_i16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_le_i16_e64 -v_cmp_le_i16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_le_i16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_le_i16_e64 v_cmp_le_u16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_le_u16_e64 -v_cmp_le_u16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_le_u16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_le_u16_e64 v_cmp_lg_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_lg_f16_e64 -v_cmp_lg_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_lg_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_lg_f16_e64 v_cmp_lt_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_lt_f16_e64 -v_cmp_lt_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_lt_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_lt_f16_e64 v_cmp_lt_i16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_lt_i16_e64 -v_cmp_lt_i16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_lt_i16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_lt_i16_e64 v_cmp_lt_u16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_lt_u16_e64 -v_cmp_lt_u16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_lt_u16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_lt_u16_e64 v_cmp_ne_i16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_ne_i16_e64 -v_cmp_ne_i16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_ne_i16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_ne_i16_e64 v_cmp_ne_u16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_ne_u16_e64 -v_cmp_ne_u16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_ne_u16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_ne_u16_e64 v_cmp_neq_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_neq_f16_e64 -v_cmp_neq_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_neq_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_neq_f16_e64 v_cmp_nge_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_nge_f16_e64 -v_cmp_nge_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_nge_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_nge_f16_e64 v_cmp_ngt_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_ngt_f16_e64 -v_cmp_ngt_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_ngt_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_ngt_f16_e64 v_cmp_nle_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_nle_f16_e64 -v_cmp_nle_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_nle_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_nle_f16_e64 v_cmp_nlg_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_nlg_f16_e64 -v_cmp_nlg_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_nlg_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_nlg_f16_e64 v_cmp_nlt_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_nlt_f16_e64 -v_cmp_nlt_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_nlt_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_nlt_f16_e64 v_cmp_o_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_o_f16_e64 -v_cmp_o_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_o_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_o_f16_e64 v_cmp_t_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_t_f16_e64 -v_cmp_t_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_t_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_t_f16_e64 v_cmp_tru_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_t_f16_e64 -v_cmp_tru_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_tru_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_t_f16_e64 v_cmp_u_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_u_f16_e64 -v_cmp_u_f16 vcc_lo, v128, v2 quad_perm:[3,2,1,0] +v_cmp_u_f16 vcc, v128, v2 quad_perm:[3,2,1,0] // GFX11: v_cmp_u_f16_e64 v_cmp_class_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1440,7 +1440,7 @@ v_cmp_class_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_class_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_class_f16_e64 -v_cmp_class_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_class_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_class_f16_e64 v_cmp_eq_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1449,10 +1449,10 @@ v_cmp_eq_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_eq_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_eq_f16_e64 -v_cmp_eq_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_eq_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_eq_f16_e64 -v_cmp_eq_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_eq_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_eq_f16_e64 v_cmp_eq_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1461,10 +1461,10 @@ v_cmp_eq_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_eq_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_eq_i16_e64 -v_cmp_eq_i16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_eq_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_eq_i16_e64 -v_cmp_eq_i16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_eq_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_eq_i16_e64 v_cmp_eq_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1473,10 +1473,10 @@ v_cmp_eq_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_eq_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_eq_u16_e64 -v_cmp_eq_u16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_eq_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_eq_u16_e64 -v_cmp_eq_u16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_eq_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_eq_u16_e64 v_cmp_f_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1485,10 +1485,10 @@ v_cmp_f_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_f_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_f_f16_e64 -v_cmp_f_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_f_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_f_f16_e64 -v_cmp_f_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_f_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_f_f16_e64 v_cmp_ge_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1497,10 +1497,10 @@ v_cmp_ge_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_ge_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ge_f16_e64 -v_cmp_ge_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_ge_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ge_f16_e64 -v_cmp_ge_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_ge_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ge_f16_e64 v_cmp_ge_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1509,10 +1509,10 @@ v_cmp_ge_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_ge_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ge_i16_e64 -v_cmp_ge_i16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_ge_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ge_i16_e64 -v_cmp_ge_i16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_ge_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ge_i16_e64 v_cmp_ge_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1521,10 +1521,10 @@ v_cmp_ge_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_ge_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ge_u16_e64 -v_cmp_ge_u16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_ge_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ge_u16_e64 -v_cmp_ge_u16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_ge_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ge_u16_e64 v_cmp_gt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1533,10 +1533,10 @@ v_cmp_gt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_gt_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_gt_f16_e64 -v_cmp_gt_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_gt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_gt_f16_e64 -v_cmp_gt_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_gt_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_gt_f16_e64 v_cmp_gt_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1545,10 +1545,10 @@ v_cmp_gt_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_gt_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_gt_i16_e64 -v_cmp_gt_i16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_gt_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_gt_i16_e64 -v_cmp_gt_i16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_gt_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_gt_i16_e64 v_cmp_gt_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1557,10 +1557,10 @@ v_cmp_gt_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_gt_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_gt_u16_e64 -v_cmp_gt_u16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_gt_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_gt_u16_e64 -v_cmp_gt_u16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_gt_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_gt_u16_e64 v_cmp_le_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1569,10 +1569,10 @@ v_cmp_le_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_le_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_le_f16_e64 -v_cmp_le_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_le_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_le_f16_e64 -v_cmp_le_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_le_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_le_f16_e64 v_cmp_le_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1581,10 +1581,10 @@ v_cmp_le_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_le_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_le_i16_e64 -v_cmp_le_i16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_le_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_le_i16_e64 -v_cmp_le_i16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_le_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_le_i16_e64 v_cmp_le_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1593,10 +1593,10 @@ v_cmp_le_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_le_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_le_u16_e64 -v_cmp_le_u16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_le_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_le_u16_e64 -v_cmp_le_u16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_le_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_le_u16_e64 v_cmp_lg_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1605,10 +1605,10 @@ v_cmp_lg_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_lg_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lg_f16_e64 -v_cmp_lg_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_lg_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lg_f16_e64 -v_cmp_lg_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_lg_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lg_f16_e64 v_cmp_lt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1617,10 +1617,10 @@ v_cmp_lt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_lt_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lt_f16_e64 -v_cmp_lt_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_lt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lt_f16_e64 -v_cmp_lt_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_lt_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lt_f16_e64 v_cmp_lt_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1629,10 +1629,10 @@ v_cmp_lt_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_lt_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lt_i16_e64 -v_cmp_lt_i16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_lt_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lt_i16_e64 -v_cmp_lt_i16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_lt_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lt_i16_e64 v_cmp_lt_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1641,10 +1641,10 @@ v_cmp_lt_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_lt_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lt_u16_e64 -v_cmp_lt_u16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_lt_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lt_u16_e64 -v_cmp_lt_u16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_lt_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lt_u16_e64 v_cmp_ne_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1653,10 +1653,10 @@ v_cmp_ne_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_ne_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ne_i16_e64 -v_cmp_ne_i16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_ne_i16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ne_i16_e64 -v_cmp_ne_i16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_ne_i16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ne_i16_e64 v_cmp_ne_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1665,10 +1665,10 @@ v_cmp_ne_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_ne_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ne_u16_e64 -v_cmp_ne_u16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_ne_u16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ne_u16_e64 -v_cmp_ne_u16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_ne_u16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ne_u16_e64 v_cmp_neq_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1677,10 +1677,10 @@ v_cmp_neq_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_neq_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_neq_f16_e64 -v_cmp_neq_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_neq_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_neq_f16_e64 -v_cmp_neq_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_neq_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_neq_f16_e64 v_cmp_nge_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1689,10 +1689,10 @@ v_cmp_nge_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_nge_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nge_f16_e64 -v_cmp_nge_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_nge_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nge_f16_e64 -v_cmp_nge_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_nge_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nge_f16_e64 v_cmp_ngt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1701,10 +1701,10 @@ v_cmp_ngt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_ngt_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ngt_f16_e64 -v_cmp_ngt_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_ngt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ngt_f16_e64 -v_cmp_ngt_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_ngt_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ngt_f16_e64 v_cmp_nle_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1713,10 +1713,10 @@ v_cmp_nle_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_nle_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nle_f16_e64 -v_cmp_nle_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_nle_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nle_f16_e64 -v_cmp_nle_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_nle_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nle_f16_e64 v_cmp_nlg_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1725,10 +1725,10 @@ v_cmp_nlg_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_nlg_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nlg_f16_e64 -v_cmp_nlg_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_nlg_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nlg_f16_e64 -v_cmp_nlg_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_nlg_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nlg_f16_e64 v_cmp_nlt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1737,10 +1737,10 @@ v_cmp_nlt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_nlt_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nlt_f16_e64 -v_cmp_nlt_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_nlt_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nlt_f16_e64 -v_cmp_nlt_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_nlt_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nlt_f16_e64 v_cmp_o_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1749,10 +1749,10 @@ v_cmp_o_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_o_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_o_f16_e64 -v_cmp_o_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_o_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_o_f16_e64 -v_cmp_o_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_o_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_o_f16_e64 v_cmp_t_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1761,10 +1761,10 @@ v_cmp_t_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_t_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_t_f16_e64 -v_cmp_t_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_t_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_t_f16_e64 -v_cmp_t_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_t_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_t_f16_e64 v_cmp_tru_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1773,10 +1773,10 @@ v_cmp_tru_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_tru_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_t_f16_e64 -v_cmp_tru_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_tru_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_t_f16_e64 -v_cmp_tru_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_tru_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_t_f16_e64 v_cmp_u_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] @@ -1785,189 +1785,189 @@ v_cmp_u_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] v_cmp_u_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_u_f16_e64 -v_cmp_u_f16 vcc_lo, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_u_f16 vcc, v1, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_u_f16_e64 -v_cmp_u_f16 vcc_lo, v127, v255 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_u_f16 vcc, v127, v255 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_u_f16_e64 v_cmp_class_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_class_f16_e64 -v_cmp_class_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_class_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_class_f16_e64 v_cmp_eq_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_eq_f16_e64 -v_cmp_eq_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_eq_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_eq_f16_e64 v_cmp_eq_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_eq_i16_e64 -v_cmp_eq_i16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_eq_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_eq_i16_e64 v_cmp_eq_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_eq_u16_e64 -v_cmp_eq_u16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_eq_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_eq_u16_e64 v_cmp_f_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_f_f16_e64 -v_cmp_f_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_f_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_f_f16_e64 v_cmp_ge_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ge_f16_e64 -v_cmp_ge_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_ge_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ge_f16_e64 v_cmp_ge_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ge_i16_e64 -v_cmp_ge_i16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_ge_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ge_i16_e64 v_cmp_ge_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ge_u16_e64 -v_cmp_ge_u16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_ge_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ge_u16_e64 v_cmp_gt_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_gt_f16_e64 -v_cmp_gt_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_gt_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_gt_f16_e64 v_cmp_gt_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_gt_i16_e64 -v_cmp_gt_i16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_gt_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_gt_i16_e64 v_cmp_gt_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_gt_u16_e64 -v_cmp_gt_u16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_gt_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_gt_u16_e64 v_cmp_le_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_le_f16_e64 -v_cmp_le_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_le_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_le_f16_e64 v_cmp_le_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_le_i16_e64 -v_cmp_le_i16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_le_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_le_i16_e64 v_cmp_le_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_le_u16_e64 -v_cmp_le_u16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_le_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_le_u16_e64 v_cmp_lg_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lg_f16_e64 -v_cmp_lg_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_lg_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lg_f16_e64 v_cmp_lt_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lt_f16_e64 -v_cmp_lt_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_lt_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lt_f16_e64 v_cmp_lt_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lt_i16_e64 -v_cmp_lt_i16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_lt_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lt_i16_e64 v_cmp_lt_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lt_u16_e64 -v_cmp_lt_u16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_lt_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_lt_u16_e64 v_cmp_ne_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ne_i16_e64 -v_cmp_ne_i16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_ne_i16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ne_i16_e64 v_cmp_ne_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ne_u16_e64 -v_cmp_ne_u16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_ne_u16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ne_u16_e64 v_cmp_neq_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_neq_f16_e64 -v_cmp_neq_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_neq_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_neq_f16_e64 v_cmp_nge_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nge_f16_e64 -v_cmp_nge_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_nge_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nge_f16_e64 v_cmp_ngt_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ngt_f16_e64 -v_cmp_ngt_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_ngt_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_ngt_f16_e64 v_cmp_nle_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nle_f16_e64 -v_cmp_nle_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_nle_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nle_f16_e64 v_cmp_nlg_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nlg_f16_e64 -v_cmp_nlg_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_nlg_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nlg_f16_e64 v_cmp_nlt_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nlt_f16_e64 -v_cmp_nlt_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_nlt_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_nlt_f16_e64 v_cmp_o_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_o_f16_e64 -v_cmp_o_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_o_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_o_f16_e64 v_cmp_t_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_t_f16_e64 -v_cmp_t_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_t_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_t_f16_e64 v_cmp_tru_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_t_f16_e64 -v_cmp_tru_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_tru_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_t_f16_e64 v_cmp_u_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_u_f16_e64 -v_cmp_u_f16 vcc_lo, v128, v2 dpp8:[7,6,5,4,3,2,1,0] +v_cmp_u_f16 vcc, v128, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmp_u_f16_e64 diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s index d52034fedc3578..e90bb80f098cb0 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s v_cmpx_class_f16_e32 v1, v2 // GFX11: encoding: [0x01,0x05,0xfa,0x7d] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s index b9903f51b332dc..d8fc1d3e2b3cd9 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s v_cmpx_class_f16_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s index 5f5d3c03038307..9db7e48809ee15 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s v_cmpx_class_f16_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopd.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopd.s index 0556861276b071..4b8ed488793db7 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopd.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopd.s @@ -1,6 +1,6 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefixes=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefixes=W64-ERR --implicit-check-not=error: %s v_dual_add_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 // GFX11: encoding: [0x04,0x05,0x08,0xc9,0x01,0x07,0x06,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_wmma.s b/llvm/test/MC/AMDGPU/gfx11_asm_wmma.s index 74cdd02a835119..ff49388c8ea467 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_wmma.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_wmma.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s // // Test v_wmma_f32_16x16x16_f16 diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported.s b/llvm/test/MC/AMDGPU/gfx11_unsupported.s index f447263c30223d..1e8d7684e942a6 100644 --- a/llvm/test/MC/AMDGPU/gfx11_unsupported.s +++ b/llvm/test/MC/AMDGPU/gfx11_unsupported.s @@ -1,5 +1,5 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s buffer_atomic_add_f64 v[2:3], off, s[12:15], s4 offset:4095 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported_dpp.s b/llvm/test/MC/AMDGPU/gfx11_unsupported_dpp.s index e9e0c5210238e2..e823c000c0847b 100644 --- a/llvm/test/MC/AMDGPU/gfx11_unsupported_dpp.s +++ b/llvm/test/MC/AMDGPU/gfx11_unsupported_dpp.s @@ -1,5 +1,5 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s v_add_co_u32_dpp v255, vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: dpp variant of this instruction is not supported diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported_e32.s b/llvm/test/MC/AMDGPU/gfx11_unsupported_e32.s index 21a3fbedb694da..352bf679883421 100644 --- a/llvm/test/MC/AMDGPU/gfx11_unsupported_e32.s +++ b/llvm/test/MC/AMDGPU/gfx11_unsupported_e32.s @@ -1,5 +1,5 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s v_add_co_u32_e32 v2, vcc, s0, v2 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: e32 variant of this instruction is not supported diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported_e64.s b/llvm/test/MC/AMDGPU/gfx11_unsupported_e64.s index a4310f3b5378fe..784018a79bb6c3 100644 --- a/llvm/test/MC/AMDGPU/gfx11_unsupported_e64.s +++ b/llvm/test/MC/AMDGPU/gfx11_unsupported_e64.s @@ -1,5 +1,5 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s v_dot2c_f32_f16_e64 v0, v1, v2 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: e64 variant of this instruction is not supported diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported_sdwa-fake16.s b/llvm/test/MC/AMDGPU/gfx11_unsupported_sdwa-fake16.s index 4946b3d53b6787..73a1455da9cf14 100644 --- a/llvm/test/MC/AMDGPU/gfx11_unsupported_sdwa-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx11_unsupported_sdwa-fake16.s @@ -1,5 +1,5 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s v_floor_f16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: sdwa variant of this instruction is not supported diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported_sdwa.s b/llvm/test/MC/AMDGPU/gfx11_unsupported_sdwa.s index 5ae5b4ef20f5d1..a2d39f6e3e3644 100644 --- a/llvm/test/MC/AMDGPU/gfx11_unsupported_sdwa.s +++ b/llvm/test/MC/AMDGPU/gfx11_unsupported_sdwa.s @@ -1,5 +1,5 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s v_add_co_ci_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: sdwa variant of this instruction is not supported diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_global_load_tr.s b/llvm/test/MC/AMDGPU/gfx12_asm_global_load_tr.s index 842cc518034029..d19faf822e529a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_global_load_tr.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_global_load_tr.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s global_load_tr_b128 v[1:4], v0, s[0:1] offset:-64 // W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index 5279588f050629..7021bd4dcdcc97 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s v_bfrev_b32_e32 v5, v1 // GFX12: encoding: [0x01,0x71,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index 29ae941a4e850c..d5cafcd4c38741 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index 9c9e63f0335b7f..4c884018bc5a86 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: encoding: [0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2.s index 233af7e1b5d354..08d4be08813192 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop2.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_add_co_ci_u32_e32 v5, vcc_lo, v1, v2, vcc_lo // W32: encoding: [0x01,0x05,0x0a,0x40] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_aliases.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_aliases.s index 5976d8af01f327..3918dd48cfc063 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_aliases.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_aliases.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s v_min_f32 v5, v1, v2 // GFX12: v_min_num_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x2a] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16.s index 548c9186ec78fe..63ffdbe821af8e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] // W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8.s index 72bf613e2a37db..54baafb5366ffb 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] // W32: encoding: [0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s index b0854881d42875..484e73da199b3f 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_add3_u32 v5, v1, v2, s3 // GFX12: encoding: [0x05,0x00,0x55,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s index 3829e0601b1e3d..7a1ebbd9e19a8a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s v_min3_f32 v5, v1, v2, v3 // GFX12: v_min3_num_f32 v5, v1, v2, v3 ; encoding: [0x05,0x00,0x29,0xd6,0x01,0x05,0x0e,0x04] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s index 16cd8d5aa5e9d5..91817b9029db39 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR,W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR,W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR,W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR,W64-ERR --implicit-check-not=error: %s v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s index d6ef14cff5fa83..3003d72b679688 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR,W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR,W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR,W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR,W64-ERR --implicit-check-not=error: %s v_add3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s index a9dd290ea67d87..39fa61f48ecc13 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s @@ -1,5 +1,5 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 --strict-whitespace --implicit-check-not=error %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 --strict-whitespace --implicit-check-not=error %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 --strict-whitespace --implicit-check-not=error %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 --strict-whitespace --implicit-check-not=error %s v_permlane16_b32 v5, v1, s2, s3 op_sel:[0, 0, 0, 1] // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid op_sel operand diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index 103fa67064ef1b..b6fcbebf6f3972 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX12 %s v_bfrev_b32_e64 v5, v1 // GFX12: encoding: [0x05,0x00,0xb8,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index ae1381a5a72932..abf07052d0df5d 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX12 %s v_bfrev_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index d88922c111f604..d27526285d18c6 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX12 %s v_bfrev_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2.s index f78acd074d87fb..b514af8384a5ed 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_add_co_ci_u32_e64 v5, s6, v1, 0xaf123456, s3 // W32: encoding: [0x05,0x06,0x20,0xd5,0x01,0xff,0x0d,0x00,0x56,0x34,0x12,0xaf] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s index 2b7830c2804d12..d628ff10f279ba 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_add_co_ci_u32_e64_dpp v5, s6, v1, v2, s3 quad_perm:[3,2,1,0] // W32: [0x05,0x06,0x20,0xd5,0xfa,0x04,0x0e,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s index b18029daaad3f7..26c2be50199b4e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_add_co_ci_u32_e64_dpp v5, s6, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] // W32: [0x05,0x06,0x20,0xd5,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3c.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3c.s index e03bb88e077104..98dbbf6cff448b 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3c.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3c.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_cmp_class_f16_e64 s5, v1, v2 // W32: encoding: [0x05,0x00,0x7d,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp16.s index 037fa392bfa613..8b090d4eb5ed63 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp16.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_cmp_class_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0] // W32: [0x05,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp8.s index c5ba45e7f008eb..44b4e4f42317c2 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp8.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_cmp_class_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // W32: [0x05,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s index d0fe55ae0a7eef..7d311dc8afd0dc 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s v_cmpx_class_f16_e64 v1, v2 // GFX12: encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s index eae2d5b2d815fe..bb092927ac9b61 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s index d63ca0ceb3dc4f..8bfd9dce48a5b9 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_aliases.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_aliases.s index 79054e9c4063ba..5915cbc011863a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_aliases.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_aliases.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s v_pk_min_f16 v0, v1, v2 // GFX12: v_pk_min_num_f16 v0, v1, v2 ; encoding: [0x00,0x40,0x1b,0xcc,0x01,0x05,0x02,0x18] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s index 32378c545ab05f..73a12ef28032ae 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0] neg_hi:[0,0,0] quad_perm:[2,2,3,1] bound_ctrl:0 fi:1 // GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x04,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s index dec1a7c512b837..e4366630cf8fd8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s v_fma_mix_f32 v0, v1, v2, v3 dpp8:[2,2,2,2,4,4,4,4] // GFX12: encoding: [0x00,0x00,0x20,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x92,0x44,0x92] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopc.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopc.s index 3ae0c0f314d937..c9241ebd161de4 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopc.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopc.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_cmp_class_f16_e32 vcc_lo, v1, v2 // W32: encoding: [0x01,0x05,0xfa,0x7c] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopc_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopc_dpp16.s index acccb95a2be0f9..0c3a38626fa6c0 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopc_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopc_dpp16.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_cmp_class_f16_dpp vcc_lo, v1, v2 quad_perm:[3,2,1,0] // W32: encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopc_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopc_dpp8.s index 9e5e7e3f88ead7..ceecbc660d06cb 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopc_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopc_dpp8.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_cmp_class_f16_dpp vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // W32: encoding: [0xe9,0x04,0xfa,0x7c,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopc_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopc_t16_promote.s index 47e9b3a7ad7ea3..65c0a3c874efbd 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopc_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopc_t16_promote.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_cmp_class_f16 vcc, v1, v255 // W64: v_cmp_class_f16_e64 vcc, v1, v255 diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s index 68c41abaf05fec..4c5a8e638e3fbe 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s v_cmpx_class_f16_e32 v1, v2 // GFX12: encoding: [0x01,0x05,0xfa,0x7d] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s index 63974efa9fbd9c..9c29f5bcd714b1 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s v_cmpx_class_f16_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s index 1a3e1fb1866a11..0f82932a9e34bb 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s v_cmpx_class_f16_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopd.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopd.s index 8df347cbc119d4..80b107982ae383 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopd.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopd.s @@ -1,6 +1,6 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefixes=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefixes=W64-ERR --implicit-check-not=error: %s v_dual_add_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 // GFX12: encoding: [0x04,0x05,0x08,0xc9,0x01,0x07,0x06,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s index 6a52cf5c2af8d1..893792799b0590 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s @@ -1,5 +1,5 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: %s v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] // GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c] diff --git a/llvm/test/MC/AMDGPU/vop3-literal.s b/llvm/test/MC/AMDGPU/vop3-literal.s index d97ded08769a4e..74b4d6d6860bc5 100644 --- a/llvm/test/MC/AMDGPU/vop3-literal.s +++ b/llvm/test/MC/AMDGPU/vop3-literal.s @@ -1,6 +1,6 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck -check-prefix=GFX10 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck -check-prefix=GFX10 %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx900 -show-encoding %s | FileCheck -check-prefix=GFX9 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck -check-prefix=GFX10-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck -check-prefix=GFX10-ERR --implicit-check-not=error: %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx900 %s 2>&1 | FileCheck -check-prefix=GFX9-ERR --implicit-check-not=error: %s v_bfe_u32 v0, 0x3039, v1, s1 diff --git a/llvm/test/MC/AMDGPU/wave32.s b/llvm/test/MC/AMDGPU/wave32.s index c52693076e2c5e..25bb4fd84433bd 100644 --- a/llvm/test/MC/AMDGPU/wave32.s +++ b/llvm/test/MC/AMDGPU/wave32.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck -check-prefix=GFX1032 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck -check-prefix=GFX1064 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck -check-prefix=GFX1032-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck -check-prefix=GFX1064-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck -check-prefix=GFX1032 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck -check-prefix=GFX1064 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck -check-prefix=GFX1032-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck -check-prefix=GFX1064-ERR --implicit-check-not=error: %s v_cmp_ge_i32_e32 s0, v0 // GFX1032: v_cmp_ge_i32_e32 vcc_lo, s0, v0 ; encoding: [0x00,0x00,0x0c,0x7d] diff --git a/llvm/test/MC/AsmParser/sse2avx-att.s b/llvm/test/MC/AsmParser/sse2avx-att.s new file mode 100644 index 00000000000000..a452a5c611d3a2 --- /dev/null +++ b/llvm/test/MC/AsmParser/sse2avx-att.s @@ -0,0 +1,89 @@ +# RUN: llvm-mc -triple x86_64 -x86-sse2avx %s | FileCheck %s +# RUN: llvm-mc -triple=x86_64 -output-asm-variant=1 %s | llvm-mc -triple=x86_64 -x86-asm-syntax=intel -x86-sse2avx + .text +# CHECK: vmovsd -352(%rbp), %xmm0 + movsd -352(%rbp), %xmm0 # xmm0 = mem[0],zero +# CHECK-NEXT: vunpcklpd %xmm1, %xmm0, %xmm0 # xmm0 = xmm0[0],xmm1[0] + unpcklpd %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0] +# CHECK-NEXT: vmovapd %xmm0, -368(%rbp) + movapd %xmm0, -368(%rbp) +# CHECK-NEXT: vmovapd -368(%rbp), %xmm0 + movapd -368(%rbp), %xmm0 +# CHECK-NEXT: vmovsd -376(%rbp), %xmm1 + movsd -376(%rbp), %xmm1 # xmm1 = mem[0],zero +# CHECK-NEXT: vmovsd -384(%rbp), %xmm0 + movsd -384(%rbp), %xmm0 # xmm0 = mem[0],zero +# CHECK-NEXT: vunpcklpd %xmm1, %xmm0, %xmm0 # xmm0 = xmm0[0],xmm1[0] + unpcklpd %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0] +# CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 + addpd %xmm1, %xmm0 +# CHECK-NEXT: vmovapd %xmm0, -464(%rbp) + movapd %xmm0, -464(%rbp) +# CHECK-NEXT: vmovaps -304(%rbp), %xmm1 + movaps -304(%rbp), %xmm1 +# CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 + pandn %xmm1, %xmm0 +# CHECK-NEXT: vmovaps %xmm0, -480(%rbp) + movaps %xmm0, -480(%rbp) +# CHECK-NEXT: vmovss -220(%rbp), %xmm1 + movss -220(%rbp), %xmm1 # xmm1 = mem[0],zero,zero,zero +# CHECK-NEXT: vinsertps $16, %xmm1, %xmm0, %xmm0 # xmm0 = xmm0[0],xmm1[0],xmm0[2,3] + insertps $16, %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +# CHECK-NEXT: vmovaps %xmm0, -496(%rbp) + movaps %xmm0, -496(%rbp) +# CHECK-NEXT: vmovss -256(%rbp), %xmm0 + movss -256(%rbp), %xmm0 # xmm0 = mem[0],zero,zero,zero +# CHECK-NEXT: vmovaps -192(%rbp), %xmm0 + movaps -192(%rbp), %xmm0 +# CHECK-NEXT: vdivss %xmm1, %xmm0, %xmm0 + divss %xmm1, %xmm0 +# CHECK-NEXT: vmovaps %xmm0, -192(%rbp) + movaps %xmm0, -192(%rbp) +# CHECK-NEXT: vmovd -128(%rbp), %xmm0 + movd -128(%rbp), %xmm0 # xmm0 = mem[0],zero,zero,zero +# CHECK-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 + pinsrd $1, %edx, %xmm0 +# CHECK-NEXT: vmovaps %xmm0, -144(%rbp) + movaps %xmm0, -144(%rbp) +# CHECK-NEXT: vmovd -160(%rbp), %xmm0 + movd -160(%rbp), %xmm0 # xmm0 = mem[0],zero,zero,zero +# CHECK-NEXT: vpblendw $170, %xmm1, %xmm0, %xmm0 # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] + pblendw $170, %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +# CHECK-NEXT: vmovdqa %xmm0, -576(%rbp) + movdqa %xmm0, -576(%rbp) +# CHECK-NEXT: vphsubw %xmm1, %xmm0, %xmm0 + phsubw %xmm1, %xmm0 +# CHECK-NEXT: vmovdqa %xmm0, -592(%rbp) + movdqa %xmm0, -592(%rbp) +# CHECK-NEXT: vmovaps -496(%rbp), %xmm0 + movaps -496(%rbp), %xmm0 +# CHECK-NEXT: vroundps $8, %xmm0, %xmm0 + roundps $8, %xmm0, %xmm0 +# CHECK-NEXT: vmovaps %xmm0, -608(%rbp) + movaps %xmm0, -608(%rbp) +# CHECK-NEXT: vmovapd -432(%rbp), %xmm0 + movapd -432(%rbp), %xmm0 +# CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 + pxor %xmm1, %xmm0 +# CHECK-NEXT: vmovaps %xmm0, -640(%rbp) + movaps %xmm0, -640(%rbp) +# CHECK-NEXT: vmovapd -32(%rbp), %xmm0 + movapd -32(%rbp), %xmm0 +# CHECK-NEXT: vmovupd %xmm0, (%rax) + movupd %xmm0, (%rax) +# CHECK-NEXT: vmovsd -656(%rbp), %xmm0 + movsd -656(%rbp), %xmm0 # xmm0 = mem[0],zero +# CHECK-NEXT: extrq $16, $8, %xmm0 # xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] + extrq $16, $8, %xmm0 +# CHECK-NEXT: insertq $16, $8, %xmm1, %xmm0 # xmm0 = xmm0[0,1],xmm1[0],xmm0[3,4,5,6,7,u,u,u,u,u,u,u,u] + insertq $16, $8, %xmm1, %xmm0 +# CHECK-NEXT: pshufw $1, %mm0, %mm2 # mm2 = mm0[1,0,0,0] + pshufw $1, %mm0, %mm2 +# CHECK-NEXT: vpblendvb %xmm2, %xmm2, %xmm1, %xmm1 + pblendvb %xmm0, %xmm2, %xmm1 +# CHECK-NEXT: vblendvps %xmm0, %xmm0, %xmm2, %xmm2 + blendvps %xmm0, %xmm0, %xmm2 +# CHECK-NEXT: vblendvpd %xmm0, %xmm0, %xmm2, %xmm2 + blendvpd %xmm0, %xmm0, %xmm2 +# CHECK-NEXT: vblendvpd %xmm0, %xmm0, %xmm2, %xmm2 + blendvpd %xmm0, %xmm2 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt index 78ca1bbdacf295..31fc10174bb0bf 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX1032 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64,-wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX1064 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX1032 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX1064 %s # GFX1032: v_cmp_lt_f32_e32 vcc_lo, s2, v4 # GFX1064: v_cmp_lt_f32_e32 vcc, s2, v4 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_ds.txt index b57ea682aaeafa..688b5f916630af 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_ds.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_ds.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s # GFX10: ds_add_f32 v0, v1 ; encoding: [0x00,0x00,0x54,0xd8,0x00,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_flat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_flat.txt index 9754b8c597a17e..f8d31294ee9cc4 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_flat.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_flat.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s #===------------------------------------------------------------------------===# diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_mubuf.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_mubuf.txt index 849c89e37011f4..30b97a05210f1f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_mubuf.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_mubuf.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s # GFX10: buffer_atomic_add v255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xc8,0xe0,0x00,0xff,0x02,0x03] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt index 0417f94353be05..890a64b22f3993 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s # GFX10: s_atc_probe 7, s[4:5], 0x64 ; encoding: [0xc2,0x01,0x98,0xf4,0x64,0x00,0x00,0xfa] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_sop1.txt index b5578bb4fd200d..eae9b84c3a3305 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_sop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_sop1.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s # GFX10: s_abs_i32 exec_hi, s1 ; encoding: [0x01,0x34,0xff,0xbe] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_sop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_sop2.txt index 85e1de86513a1e..bc33e7494f9c13 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_sop2.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_sop2.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s # GFX10: s_absdiff_i32 exec_hi, s1, s2 ; encoding: [0x01,0x02,0x7f,0x96] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_sopc.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_sopc.txt index baa5c5f06a1ff7..d556dd08356201 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_sopc.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_sopc.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s # GFX10: s_bitcmp0_b32 exec_hi, s1 ; encoding: [0x7f,0x01,0x0c,0xbf] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_sopk.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_sopk.txt index 1878f8b4672276..cb59a54dbb5396 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_sopk.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_sopk.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s # GFX10: s_addk_i32 exec_hi, 0x1234 ; encoding: [0x34,0x12,0xff,0xb7] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_sopp.txt index 8022439c72d528..9b952bffcce951 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_sopp.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_sopp.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s # GFX10: s_barrier ; encoding: [0x00,0x00,0x8a,0xbf] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop1.txt index ee1114f4ab32ba..ff8d39597bb706 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop1.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s # GFX10: v_bfrev_b32_e32 v255, v1 ; encoding: [0x01,0x71,0xfe,0x7f] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop1_dpp16.txt index d80b2dd748281b..435a7440b29b90 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop1_dpp16.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s # GFX10: v_bfrev_b32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x70,0xfe,0x7f,0x01,0xe4,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop1_dpp8.txt index b354990ddfdefb..1ff94624c07164 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop1_dpp8.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX10 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX10 %s # GFX10: v_mov_b32_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0xe9,0x02,0x0a,0x7e,0x01,0x88,0xc6,0xfa] 0xe9,0x02,0x0a,0x7e,0x01,0x88,0xc6,0xfa diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop1_sdwa.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop1_sdwa.txt index 0739ba3973f343..8db91e616209dc 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop1_sdwa.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop1_sdwa.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s # GFX10: v_bfrev_b32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD ; encoding: [0xf9,0x70,0xfe,0x7f,0x01,0x06,0x06,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt index b759912204db82..fb1099d7099408 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX10,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX10,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX10,W32 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX10,W64 %s # W32: v_add_co_ci_u32_e32 v255, vcc_lo, v1, v2, vcc_lo ; encoding: [0x01,0x05,0xfe,0x51] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt index 0d098496af0915..1774efa4a65c7e 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX10,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX10,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX10,W32 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX10,W64 %s # GFX10: v_add_f16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0xfe,0x65,0x01,0xe4,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt index 819a27647c1bae..40b8f24e4d72fa 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX10,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX10,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX10,W32 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX10,W64 %s # GFX10: v_add_f32_dpp v5, v1, v2 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0xe9,0x04,0x0a,0x06,0x01,0x88,0xc6,0xfa] 0xe9,0x04,0x0a,0x06,0x01,0x88,0xc6,0xfa diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_sdwa.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_sdwa.txt index 3dece53d48d29e..146931d14a4aaa 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_sdwa.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_sdwa.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX10,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX10,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX10,W32 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX10,W64 %s # W32: v_add_co_ci_u32_sdwa v255, vcc_lo, v1, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x04,0xfe,0x51,0x01,0x06,0x06,0x06] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt index 4c0170ca4e4747..6da1423fe8278e 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX10,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX10,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX10,W32 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX10,W64 %s # GFX10: v_add3_u32 v255, v1, v2, v3 ; encoding: [0xff,0x00,0x6d,0xd7,0x01,0x05,0x0e,0x04] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3c.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3c.txt index f682c5aa65b161..9d93337136462b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3c.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3c.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=W32 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=W64 %s # W32: v_cmp_class_f32_e64 s10, -1, v2 ; encoding: [0x0a,0x00,0x88,0xd4,0xc1,0x04,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3cx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3cx.txt index d70f07dd835d71..8e51ae9305c99b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3cx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3cx.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s # GFX10: v_cmpx_class_f16_e64 -1, v2 ; encoding: [0x7e,0x00,0x9f,0xd4,0xc1,0x04,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopc.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopc.txt index 3aeb21349f6f06..2156a682337e8d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopc.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopc.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=W32 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=W64 %s # W32: v_cmp_class_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x10,0x7d] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopc_sdwa.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopc_sdwa.txt index 5b9ff0333e98a1..a540c8757d77ff 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopc_sdwa.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopc_sdwa.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=W32 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=W64 %s # W32: v_cmp_class_f32_sdwa s100, v1, v2 src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x04,0x10,0x7d,0x01,0xe4,0x06,0x06] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopcx.txt index 4ed855b97453d2..92e512958357de 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopcx.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s # GFX10: v_cmpx_class_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x3e,0x7d] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopcx_sdwa.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopcx_sdwa.txt index c94d883fcf9131..47bb30fe7f1b83 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopcx_sdwa.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopcx_sdwa.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX10 %s # GFX10: v_cmpx_eq_f16_sdwa -v1, v2 src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x04,0xb4,0x7d,0x01,0x00,0x16,0x06] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt index fc35a2e6b4f8f4..0c1d538a22750c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s # GFX11: v_add3_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x55,0xd6,0x01,0x05,0x0e,0x00] 0x05,0x00,0x55,0xd6,0x01,0x05,0x0e,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt index 8b4dca4f5bd11e..486243c450d67c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s # GFX11: v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] 0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop2.txt index a0838e09bf4578..a5d04cfb82ac46 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop2.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop2.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s # W32: v_add_co_ci_u32_e64_dpp v5, s12, v1, v2, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x20,0xd5,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] # W64: v_add_co_ci_u32_e64_dpp v5, s[12:13], v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x20,0xd5,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt index 78470d4707494d..e88aad33127579 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s # GFX11: v_add3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] 0x05,0x00,0x55,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop2.txt index 5aa68cba4a393c..22b67ede42d64d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop2.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop2.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s # W32: v_add_co_ci_u32_e64_dpp v5, s12, v1, v2, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0c,0x20,0xd5,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] # W64: v_add_co_ci_u32_e64_dpp v5, s[12:13], v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0c,0x20,0xd5,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt index 139c097a2223c4..d7f6c8de471eac 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt @@ -1,7 +1,7 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,GFX11-REAL16 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,GFX11-REAL16 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,GFX11-FAKE16 %s -# R UN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,GFX11-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,GFX11-FAKE16 %s +# R UN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,GFX11-FAKE16 %s # GFX11: v_bfrev_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xb8,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0xb8,0xd5,0x01,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop2.txt index bd57ebfe3b5ed7..e37b307bf9dbad 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop2.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop2.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,W64 %s # W32: v_add_co_ci_u32_e64 v5, s12, v1, 0xaf123456, s6 ; encoding: [0x05,0x0c,0x20,0xd5,0x01,0xff,0x19,0x00,0x56,0x34,0x12,0xaf] # W64: v_add_co_ci_u32_e64 v5, s[12:13], v1, 0xaf123456, s[6:7] ; encoding: [0x05,0x0c,0x20,0xd5,0x01,0xff,0x19,0x00,0x56,0x34,0x12,0xaf] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc.txt index f426c1cd7e79f9..29e39c9e60ec87 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64 # W32: v_cmp_class_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0xfa,0x7c] # W64: v_cmp_class_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0xfa,0x7c] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc_dpp16.txt index 59d635bbe2bf32..460b222d0b7d9a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc_dpp16.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64 # W32: v_cmp_class_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_class_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc_dpp8.txt index 29a55cb70dc483..18f9db15c980f8 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopc_dpp8.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64 # W32: v_cmp_class_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_class_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7c,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt index b5538bf0a98f8b..863d747e9c0ba3 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 # GFX11: v_cmpx_class_f16_e32 v1, v2 ; encoding: [0x01,0x05,0xfa,0x7d] 0x01,0x05,0xfa,0x7d diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt index a47440f5092d1b..e88d666fb3f935 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 # GFX11: v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt index fd1c626ea51be9..6c51c9ba5a24f1 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 # GFX11: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopd.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopd.txt index bd8f4c667d4ffa..222718b70f0d74 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopd.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopd.txt @@ -1,4 +1,4 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX11 # GFX11: v_dual_add_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x08,0xc9,0x01,0x07,0x06,0xff] 0x04,0x05,0x08,0xc9,0x01,0x07,0x06,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_wmma.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_wmma.txt index fff7a807bfd6c8..5d6d6b6cc3d162 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_wmma.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_wmma.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -check-prefix=W32 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=W64 %s # Test v_wmma_f32_16x16x16_f16 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_global_load_tr.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_global_load_tr.txt index b561c76571afb6..a9e14899645984 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_global_load_tr.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_global_load_tr.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s -# RUN: llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s +# RUN: llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s +# RUN: llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=W32 %s # W32: global_load_tr_b128 v[1:4], v0, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] # W64: global_load_tr_b128 v[1:2], v0, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt index d54f819c5a063f..6637a88c3eee5a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s # GFX12: v_bfrev_b32_e32 v5, v1 ; encoding: [0x01,0x71,0x0a,0x7e] 0x01,0x71,0x0a,0x7e diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index 09e0a925d90e54..ac45962e1743e4 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s # GFX12: v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 8329ae4cefc9a7..957c425008c872 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s # GFX12: v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2.txt index 1021298b276ee7..673db0664fc6ab 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s # W32: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v2, vcc_lo ; encoding: [0x01,0x05,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, v1, v2, vcc ; encoding: [0x01,0x05,0x0a,0x40] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp16.txt index 337ae1a13970b3..05c8dff02a40b4 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp16.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp8.txt index 0cb02f4de30b32..2e33df35af1f36 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp8.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt index 2c911777ef97f4..44cbe5f31b2cf2 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s # GFX12: v_add3_u32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x55,0xd6,0x01,0x05,0x0e,0x00] 0x05,0x00,0x55,0xd6,0x01,0x05,0x0e,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt index f9b6c1b73ddc4f..f6bb2e4a55282b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s # GFX12: v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] 0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt index eedc6d49108782..f291795c8a627b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s # GFX12: v_add3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] 0x05,0x00,0x55,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2.txt index 6802d790f576f1..1312a6a34ae4dd 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s # W32: v_add_co_ci_u32_e64 v5, s12, v1, 0xaf123456, s6 ; encoding: [0x05,0x0c,0x20,0xd5,0x01,0xff,0x19,0x00,0x56,0x34,0x12,0xaf] # W64: v_add_co_ci_u32_e64 v5, s[12:13], v1, 0xaf123456, s[6:7] ; encoding: [0x05,0x0c,0x20,0xd5,0x01,0xff,0x19,0x00,0x56,0x34,0x12,0xaf] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp16.txt index b10b8da751624f..92dba4e7343087 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp16.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s # W32: v_add_co_ci_u32_e64_dpp v5, s12, v1, v2, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x20,0xd5,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] # W64: v_add_co_ci_u32_e64_dpp v5, s[12:13], v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0c,0x20,0xd5,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp8.txt index f78106e47d0b6b..d97fa3f513f532 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp8.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s # W32: v_add_co_ci_u32_e64_dpp v5, s12, v1, v2, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0c,0x20,0xd5,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] # W64: v_add_co_ci_u32_e64_dpp v5, s[12:13], v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0c,0x20,0xd5,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c.txt index 90dca108a36b50..057e81c6fe27fa 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s # W32: v_cmp_class_f16_e64 s10, v1, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x05,0x02,0x00] # W64: v_cmp_class_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x7d,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp16.txt index 13e34ca705786e..564312f579bce5 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp16.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp8.txt index f36857befe8abf..1d2169efa5ac00 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp8.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s # W32: v_cmp_class_f16_e64_dpp s10, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt index 28f2bea07e574d..f8efb827361178 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s # GFX12: v_cmpx_class_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0xfd,0xd4,0x01,0x05,0x02,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt index 0f933f03ba220b..ce56a87570ec3d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s # GFX12: v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt index bf4f971bac7352..0184f667640b53 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s # GFX12: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p.txt index 53c4350a7dc16e..1476bef59f2462 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s # GFX12: v_dot2_f32_bf16 v5, v1, v2, v3 ; encoding: [0x05,0x40,0x1a,0xcc,0x01,0x05,0x0e,0x1c] 0x05,0x40,0x1a,0xcc,0x01,0x05,0x0e,0x1c diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt index 426e70103d9393..d97418b279b749 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s # GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe] 0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt index 6f4290cfdb12e6..1461065d1c5420 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s # GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05] 0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc.txt index d9a75d25c1f208..320e85238e36e4 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64 # W32: v_cmp_class_f16_e32 vcc_lo, v1, v2 ; encoding: [0x01,0x05,0xfa,0x7c] # W64: v_cmp_class_f16_e32 vcc, v1, v2 ; encoding: [0x01,0x05,0xfa,0x7c] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc_dpp16.txt index 4afd8b1c8a3272..be40d5fc8af9ea 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc_dpp16.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64 # W32: v_cmp_class_f16 vcc_lo, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x1b,0x00,0xff] # W64: v_cmp_class_f16 vcc, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7c,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc_dpp8.txt index 906da92b9afc70..3fb7bef6c18aeb 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopc_dpp8.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W32 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=W64 # W32: v_cmp_class_f16 vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7c,0x01,0x77,0x39,0x05] # W64: v_cmp_class_f16 vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7c,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt index 6588cdd0fba0ac..5a3b1f183ebfa7 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 # GFX12: v_cmpx_class_f16_e32 v1, v2 ; encoding: [0x01,0x05,0xfa,0x7d] 0x01,0x05,0xfa,0x7d diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt index 14f0ca72967134..704a17f8b091fc 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 # GFX12: v_cmpx_class_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0xfa,0x7d,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt index 7ca10f5bf3d304..0b030b9e316ea4 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 # GFX12: v_cmpx_class_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05] 0xe9,0x04,0xfa,0x7d,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopd.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopd.txt index 26bb8b8f92b0f1..51c2906608c837 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopd.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopd.txt @@ -1,4 +1,4 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX12 # GFX12: v_dual_add_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x08,0xc9,0x01,0x07,0x06,0xff] 0x04,0x05,0x08,0xc9,0x01,0x07,0x06,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt index 4fc236da131b11..1fb7613852aa53 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt @@ -1,5 +1,5 @@ -# RUN: not llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s -# RUN: not llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX12-ERR %s +# RUN: not llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +# RUN: not llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX12-ERR %s [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c] # GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c] diff --git a/llvm/test/MC/Disassembler/SystemZ/insns.txt b/llvm/test/MC/Disassembler/SystemZ/insns.txt index 23714dfc3a8e6e..392993d4e2a36a 100644 --- a/llvm/test/MC/Disassembler/SystemZ/insns.txt +++ b/llvm/test/MC/Disassembler/SystemZ/insns.txt @@ -1315,22 +1315,22 @@ # CHECK: bassm %r15, %r1 0x0c 0xf1 -# CHECK: bc 0, 0 +# CHECK: nop 0 0x47 0x00 0x00 0x00 -# CHECK: bc 0, 4095 +# CHECK: nop 4095 0x47 0x00 0x0f 0xff -# CHECK: bc 0, 0(%r1) +# CHECK: nop 0(%r1) 0x47 0x00 0x10 0x00 -# CHECK: bc 0, 0(%r15) +# CHECK: nop 0(%r15) 0x47 0x00 0xf0 0x00 -# CHECK: bc 0, 4095(%r1,%r15) +# CHECK: nop 4095(%r1,%r15) 0x47 0x01 0xff 0xff -# CHECK: bc 0, 4095(%r15,%r1) +# CHECK: nop 4095(%r15,%r1) 0x47 0x0f 0x1f 0xff # CHECK: bo 0(%r13) @@ -1375,9 +1375,12 @@ # CHECK: bno 0 0x47 0xe0 0x00 0x00 -# CHECK: bcr 0, %r14 +# CHECK: nopr %r14 0x07 0x0e +# CHECK: nopr %r7 +0x07 0x07 + # CHECK: bor %r13 0x07 0x1d @@ -13968,11 +13971,11 @@ # CHECK: risbg %r0, %r0, 0, 0, 0 0xec 0x00 0x00 0x00 0x00 0x55 -# CHECK: risbg %r0, %r0, 0, 0, 63 -0xec 0x00 0x00 0x00 0x3f 0x55 +# CHECK: risbg %r0, %r0, 0, 0, 255 +0xec 0x00 0x00 0x00 0xff 0x55 -# CHECK: risbg %r0, %r0, 0, 255, 0 -0xec 0x00 0x00 0xff 0x00 0x55 +# CHECK: risbg %r0, %r0, 0, 127, 0 +0xec 0x00 0x00 0x7f 0x00 0x55 # CHECK: risbg %r0, %r0, 255, 0, 0 0xec 0x00 0xff 0x00 0x00 0x55 @@ -13986,14 +13989,17 @@ # CHECK: risbg %r4, %r5, 6, 7, 8 0xec 0x45 0x06 0x07 0x08 0x55 +# CHECK: risbgz %r4, %r5, 6, 7, 8 +0xec 0x45 0x06 0x87 0x08 0x55 + # CHECK: risbgn %r0, %r0, 0, 0, 0 0xec 0x00 0x00 0x00 0x00 0x59 -# CHECK: risbgn %r0, %r0, 0, 0, 63 -0xec 0x00 0x00 0x00 0x3f 0x59 +# CHECK: risbgn %r0, %r0, 0, 0, 255 +0xec 0x00 0x00 0x00 0xff 0x59 -# CHECK: risbgn %r0, %r0, 0, 255, 0 -0xec 0x00 0x00 0xff 0x00 0x59 +# CHECK: risbgn %r0, %r0, 0, 127, 0 +0xec 0x00 0x00 0x7f 0x00 0x59 # CHECK: risbgn %r0, %r0, 255, 0, 0 0xec 0x00 0xff 0x00 0x00 0x59 @@ -14007,6 +14013,9 @@ # CHECK: risbgn %r4, %r5, 6, 7, 8 0xec 0x45 0x06 0x07 0x08 0x59 +# CHECK: risbgnz %r4, %r5, 6, 7, 8 +0xec 0x45 0x06 0x87 0x08 0x59 + # CHECK: risbhg %r0, %r0, 0, 0, 0 0xec 0x00 0x00 0x00 0x00 0x5d diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s index 7d0858ff2ebba0..0ba15cfd489cb1 100644 --- a/llvm/test/MC/RISCV/attribute-arch.s +++ b/llvm/test/MC/RISCV/attribute-arch.s @@ -408,11 +408,11 @@ .attribute arch, "rv32i_xcvbi" # CHECK: attribute 5, "rv32i2p1_xcvbi1p0" -.attribute arch, "rv32i_zicfilp0p4" -# CHECK: attribute 5, "rv32i2p1_zicfilp0p4_zicsr2p0" +.attribute arch, "rv32i_zicfilp1p0" +# CHECK: attribute 5, "rv32i2p1_zicfilp1p0_zicsr2p0" -.attribute arch, "rv32i_zicfiss0p4" -# CHECK: .attribute 5, "rv32i2p1_zicfiss0p4_zicsr2p0_zimop1p0" +.attribute arch, "rv32i_zicfiss1p0" +# CHECK: .attribute 5, "rv32i2p1_zicfiss1p0_zicsr2p0_zimop1p0" .attribute arch, "rv64i_xsfvfwmaccqqq" # CHECK: attribute 5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvl32b1p0_xsfvfwmaccqqq1p0" diff --git a/llvm/test/MC/SystemZ/insn-good-z15.s b/llvm/test/MC/SystemZ/insn-good-z15.s index 36476161ea46de..108f8421162313 100644 --- a/llvm/test/MC/SystemZ/insn-good-z15.s +++ b/llvm/test/MC/SystemZ/insn-good-z15.s @@ -146,24 +146,28 @@ #CHECK: nogrk %r0, %r15, %r0 # encoding: [0xb9,0x66,0x00,0x0f] #CHECK: nogrk %r15, %r0, %r0 # encoding: [0xb9,0x66,0x00,0xf0] #CHECK: nogrk %r7, %r8, %r9 # encoding: [0xb9,0x66,0x90,0x78] +#CHECK: notgr %r7, %r8 # encoding: [0xb9,0x66,0x80,0x78] nogrk %r0,%r0,%r0 nogrk %r0,%r0,%r15 nogrk %r0,%r15,%r0 nogrk %r15,%r0,%r0 nogrk %r7,%r8,%r9 + notgr %r7,%r8 #CHECK: nork %r0, %r0, %r0 # encoding: [0xb9,0x76,0x00,0x00] #CHECK: nork %r0, %r0, %r15 # encoding: [0xb9,0x76,0xf0,0x00] #CHECK: nork %r0, %r15, %r0 # encoding: [0xb9,0x76,0x00,0x0f] #CHECK: nork %r15, %r0, %r0 # encoding: [0xb9,0x76,0x00,0xf0] #CHECK: nork %r7, %r8, %r9 # encoding: [0xb9,0x76,0x90,0x78] +#CHECK: notr %r7, %r8 # encoding: [0xb9,0x76,0x80,0x78] nork %r0,%r0,%r0 nork %r0,%r0,%r15 nork %r0,%r15,%r0 nork %r15,%r0,%r0 nork %r7,%r8,%r9 + notr %r7,%r8 #CHECK: nxgrk %r0, %r0, %r0 # encoding: [0xb9,0x67,0x00,0x00] #CHECK: nxgrk %r0, %r0, %r15 # encoding: [0xb9,0x67,0xf0,0x00] diff --git a/llvm/test/MC/SystemZ/insn-good-z196.s b/llvm/test/MC/SystemZ/insn-good-z196.s index fc90b18e66d8f1..d2a7724d3a9a25 100644 --- a/llvm/test/MC/SystemZ/insn-good-z196.s +++ b/llvm/test/MC/SystemZ/insn-good-z196.s @@ -276,10 +276,13 @@ #CHECK: brcth %r7, frob@PLT # encoding: [0xcc,0x76,A,A,A,A] # fixup A - offset: 2, value: frob@PLT+2, kind: FK_390_PC32DBL #CHECK: brcth %r8, frob@PLT # encoding: [0xcc,0x86,A,A,A,A] +# fixup A - offset: 2, value: frob@PLT+2, kind: FK_390_PC32DBL +#CHECK: brcth %r8, frob@PLT # encoding: [0xcc,0x86,A,A,A,A] # fixup A - offset: 2, value: frob@PLT+2, kind: FK_390_PC32DBL brcth %r7,frob@PLT brcth %r8,frob@PLT + jcth %r8,frob@PLT #CHECK: cdfbra %f0, 0, %r0, 0 # encoding: [0xb3,0x95,0x00,0x00] #CHECK: cdfbra %f0, 0, %r0, 15 # encoding: [0xb3,0x95,0x0f,0x00] diff --git a/llvm/test/MC/SystemZ/insn-good-zEC12.s b/llvm/test/MC/SystemZ/insn-good-zEC12.s index db37d28686e9bb..a564491c6c36fa 100644 --- a/llvm/test/MC/SystemZ/insn-good-zEC12.s +++ b/llvm/test/MC/SystemZ/insn-good-zEC12.s @@ -462,21 +462,33 @@ #CHECK: risbgn %r0, %r0, 0, 0, 63 # encoding: [0xec,0x00,0x00,0x00,0x3f,0x59] #CHECK: risbgn %r0, %r0, 0, 0, 64 # encoding: [0xec,0x00,0x00,0x00,0x40,0x59] #CHECK: risbgn %r0, %r0, 0, 0, 255 # encoding: [0xec,0x00,0x00,0x00,0xff,0x59] -#CHECK: risbgn %r0, %r0, 0, 255, 0 # encoding: [0xec,0x00,0x00,0xff,0x00,0x59] +#CHECK: risbgnz %r0, %r0, 0, 255, 0 # encoding: [0xec,0x00,0x00,0xff,0x00,0x59] +#CHECK: risbgn %r0, %r0, 0, 255, 0 # encoding: [0xec,0x00,0x00,0xff,0x00,0x59] #CHECK: risbgn %r0, %r0, 255, 0, 0 # encoding: [0xec,0x00,0xff,0x00,0x00,0x59] +#CHECK: risbgn %r0, %r0, 0, 0, 127 # encoding: [0xec,0x00,0x00,0x00,0x7f,0x59] +#CHECK: risbgnz %r0, %r0, 0, 127, 0 # encoding: [0xec,0x00,0x00,0xff,0x00,0x59] +#CHECK: risbgn %r0, %r0, 0, 127, 0 # encoding: [0xec,0x00,0x00,0x7f,0x00,0x59] +#CHECK: risbgn %r0, %r0, 127, 0, 0 # encoding: [0xec,0x00,0x7f,0x00,0x00,0x59] #CHECK: risbgn %r0, %r15, 0, 0, 0 # encoding: [0xec,0x0f,0x00,0x00,0x00,0x59] #CHECK: risbgn %r15, %r0, 0, 0, 0 # encoding: [0xec,0xf0,0x00,0x00,0x00,0x59] #CHECK: risbgn %r4, %r5, 6, 7, 8 # encoding: [0xec,0x45,0x06,0x07,0x08,0x59] +#CHECK: risbgnz %r4, %r5, 6, 7, 8 # encoding: [0xec,0x45,0x06,0x87,0x08,0x59] risbgn %r0,%r0,0,0,0 risbgn %r0,%r0,0,0,63 risbgn %r0,%r0,0,0,64 risbgn %r0,%r0,0,0,255 + risbgnz %r0,%r0,0,255,0 risbgn %r0,%r0,0,255,0 risbgn %r0,%r0,255,0,0 + risbgn %r0,%r0,0,0,127 + risbgnz %r0,%r0,0,127,0 + risbgn %r0,%r0,0,127,0 + risbgn %r0,%r0,127,0,0 risbgn %r0,%r15,0,0,0 risbgn %r15,%r0,0,0,0 risbgn %r4,%r5,6,7,8 + risbgnz %r4,%r5,6,7,8 #CHECK: tabort 0 # encoding: [0xb2,0xfc,0x00,0x00] #CHECK: tabort 0(%r1) # encoding: [0xb2,0xfc,0x10,0x00] diff --git a/llvm/test/MC/SystemZ/insn-good.s b/llvm/test/MC/SystemZ/insn-good.s index 2add4a108319e6..9fcb8a42cd73c8 100644 --- a/llvm/test/MC/SystemZ/insn-good.s +++ b/llvm/test/MC/SystemZ/insn-good.s @@ -1441,6 +1441,8 @@ jo foo bro foo +#CHECK: brc 2, foo # encoding: [0xa7,0x24,A,A] +#CHECK: fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL #CHECK: brc 2, foo # encoding: [0xa7,0x24,A,A] #CHECK: fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL #CHECK: jh foo # encoding: [0xa7,0x24,A,A] @@ -1452,6 +1454,7 @@ #CHECK: jp foo # encoding: [0xa7,0x24,A,A] #CHECK: fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL brc 2, foo + jc 2, foo jh foo jp foo brh foo @@ -8694,11 +8697,13 @@ #CHECK: iilf %r0, 0 # encoding: [0xc0,0x09,0x00,0x00,0x00,0x00] #CHECK: iilf %r0, 4294967295 # encoding: [0xc0,0x09,0xff,0xff,0xff,0xff] +#CHECK: iilf %r15, 0 # encoding: [0xc0,0xf9,0x00,0x00,0x00,0x00] #CHECK: iilf %r15, 0 # encoding: [0xc0,0xf9,0x00,0x00,0x00,0x00] iilf %r0, 0 iilf %r0, 0xffffffff iilf %r15, 0 + lfi %r15, 0 #CHECK: iilh %r0, 0 # encoding: [0xa5,0x02,0x00,0x00] #CHECK: iilh %r0, 32768 # encoding: [0xa5,0x02,0x80,0x00] @@ -10335,11 +10340,13 @@ #CHECK: llilf %r0, 0 # encoding: [0xc0,0x0f,0x00,0x00,0x00,0x00] #CHECK: llilf %r0, 4294967295 # encoding: [0xc0,0x0f,0xff,0xff,0xff,0xff] +#CHECK: llilf %r15, 0 # encoding: [0xc0,0xff,0x00,0x00,0x00,0x00] #CHECK: llilf %r15, 0 # encoding: [0xc0,0xff,0x00,0x00,0x00,0x00] llilf %r0, 0 llilf %r0, 0xffffffff llilf %r15, 0 + llgfi %r15, 0 #CHECK: llilh %r0, 0 # encoding: [0xa5,0x0e,0x00,0x00] #CHECK: llilh %r0, 32768 # encoding: [0xa5,0x0e,0x80,0x00] @@ -10354,12 +10361,14 @@ #CHECK: llill %r0, 0 # encoding: [0xa5,0x0f,0x00,0x00] #CHECK: llill %r0, 32768 # encoding: [0xa5,0x0f,0x80,0x00] #CHECK: llill %r0, 65535 # encoding: [0xa5,0x0f,0xff,0xff] +#CHECK: llill %r15, 0 # encoding: [0xa5,0xff,0x00,0x00] #CHECK: llill %r15, 0 # encoding: [0xa5,0xff,0x00,0x00] llill %r0, 0 llill %r0, 0x8000 llill %r0, 0xffff llill %r15, 0 + llghi %r15, 0 #CHECK: lm %r0, %r0, 0 # encoding: [0x98,0x00,0x00,0x00] #CHECK: lm %r0, %r15, 0 # encoding: [0x98,0x0f,0x00,0x00] @@ -13122,10 +13131,10 @@ niy 524287(%r1), 42 niy 524287(%r15), 42 -#CHECK: bc 0, 0 # encoding: [0x47,0x00,0x00,0x00] +#CHECK: nop 0 # encoding: [0x47,0x00,0x00,0x00] #CHECK: nop # encoding: [0x47,0x00,0x00,0x00] -#CHECK: bcr 0, %r7 # encoding: [0x07,0x07] -#CHECK: bcr 0, %r0 # encoding: [0x07,0x00] +#CHECK: nopr %r7 # encoding: [0x07,0x07] +#CHECK: nopr %r0 # encoding: [0x07,0x00] nop 0 nop @@ -13680,21 +13689,33 @@ #CHECK: risbg %r0, %r0, 0, 0, 63 # encoding: [0xec,0x00,0x00,0x00,0x3f,0x55] #CHECK: risbg %r0, %r0, 0, 0, 64 # encoding: [0xec,0x00,0x00,0x00,0x40,0x55] #CHECK: risbg %r0, %r0, 0, 0, 255 # encoding: [0xec,0x00,0x00,0x00,0xff,0x55] -#CHECK: risbg %r0, %r0, 0, 255, 0 # encoding: [0xec,0x00,0x00,0xff,0x00,0x55] +#CHECK: risbgz %r0, %r0, 0, 255, 0 # encoding: [0xec,0x00,0x00,0xff,0x00,0x55] +#CHECK: risbg %r0, %r0, 0, 255, 0 # encoding: [0xec,0x00,0x00,0xff,0x00,0x55] #CHECK: risbg %r0, %r0, 255, 0, 0 # encoding: [0xec,0x00,0xff,0x00,0x00,0x55] +#CHECK: risbg %r0, %r0, 0, 0, 127 # encoding: [0xec,0x00,0x00,0x00,0x7f,0x55] +#CHECK: risbgz %r0, %r0, 0, 127, 0 # encoding: [0xec,0x00,0x00,0xff,0x00,0x55] +#CHECK: risbg %r0, %r0, 0, 127, 0 # encoding: [0xec,0x00,0x00,0x7f,0x00,0x55] +#CHECK: risbg %r0, %r0, 127, 0, 0 # encoding: [0xec,0x00,0x7f,0x00,0x00,0x55] #CHECK: risbg %r0, %r15, 0, 0, 0 # encoding: [0xec,0x0f,0x00,0x00,0x00,0x55] #CHECK: risbg %r15, %r0, 0, 0, 0 # encoding: [0xec,0xf0,0x00,0x00,0x00,0x55] #CHECK: risbg %r4, %r5, 6, 7, 8 # encoding: [0xec,0x45,0x06,0x07,0x08,0x55] +#CHECK: risbgz %r4, %r5, 6, 7, 8 # encoding: [0xec,0x45,0x06,0x87,0x08,0x55] risbg %r0,%r0,0,0,0 risbg %r0,%r0,0,0,63 risbg %r0,%r0,0,0,64 risbg %r0,%r0,0,0,255 + risbgz %r0,%r0,0,255,0 risbg %r0,%r0,0,255,0 risbg %r0,%r0,255,0,0 + risbg %r0,%r0,0,0,127 + risbgz %r0,%r0,0,127,0 + risbg %r0,%r0,0,127,0 + risbg %r0,%r0,127,0,0 risbg %r0,%r15,0,0,0 risbg %r15,%r0,0,0,0 risbg %r4,%r5,6,7,8 + risbgz %r4,%r5,6,7,8 #CHECK: rll %r0, %r0, 0 # encoding: [0xeb,0x00,0x00,0x00,0x00,0x1d] #CHECK: rll %r15, %r1, 0 # encoding: [0xeb,0xf1,0x00,0x00,0x00,0x1d] diff --git a/llvm/test/MC/X86/intel-syntax-expr.s b/llvm/test/MC/X86/intel-syntax-expr.s new file mode 100644 index 00000000000000..8aa083107dd85b --- /dev/null +++ b/llvm/test/MC/X86/intel-syntax-expr.s @@ -0,0 +1,8 @@ +// RUN: not llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel %s 2>&1 | FileCheck %s + +// When the intel syntax is enabled, to parse an operand, X86AsmParser doesn't use the method parseExpression from AsmParser +// but ParseIntelExpr which was not processing well an end of statement. + +// CHECK: error: unknown token in expression +test: +i- diff --git a/llvm/test/TableGen/riscv-target-def.td b/llvm/test/TableGen/riscv-target-def.td index fb58448d7ce881..7137cf96fd3d44 100644 --- a/llvm/test/TableGen/riscv-target-def.td +++ b/llvm/test/TableGen/riscv-target-def.td @@ -153,13 +153,13 @@ def ROCKET : RISCVTuneProcessorModel<"rocket", // CHECK: #endif // GET_SUPPORTED_PROFILES // CHECK: #ifndef PROC -// CHECK-NEXT: #define PROC(ENUM, NAME, DEFAULT_MARCH, FAST_UNALIGNED_ACCESS) +// CHECK-NEXT: #define PROC(ENUM, NAME, DEFAULT_MARCH, FAST_SCALAR_UNALIGN, FAST_VECTOR_UNALIGN) // CHECK-NEXT: #endif -// CHECK: PROC(GENERIC_RV32, {"generic-rv32"}, {"rv32i2p1"}, 0) -// CHECK-NEXT: PROC(GENERIC_RV64, {"generic-rv64"}, {"rv64i2p1"}, 0) -// CHECK-NEXT: PROC(ROCKET_RV32, {"rocket-rv32"}, {"rv32i2p1_zicsr2p0_zidummy0p1_zifencei2p0"}, 0) -// CHECK-NEXT: PROC(ROCKET_RV64, {"rocket-rv64"}, {"rv64i2p1_zicsr2p0_zidummy0p1_zifencei2p0"}, 0) +// CHECK: PROC(GENERIC_RV32, {"generic-rv32"}, {"rv32i2p1"}, 0, 0) +// CHECK-NEXT: PROC(GENERIC_RV64, {"generic-rv64"}, {"rv64i2p1"}, 0, 0) +// CHECK-NEXT: PROC(ROCKET_RV32, {"rocket-rv32"}, {"rv32i2p1_zicsr2p0_zidummy0p1_zifencei2p0"}, 0, 0) +// CHECK-NEXT: PROC(ROCKET_RV64, {"rocket-rv64"}, {"rv64i2p1_zicsr2p0_zidummy0p1_zifencei2p0"}, 0, 0) // CHECK: #undef PROC diff --git a/llvm/test/Transforms/AggressiveInstCombine/memchr.ll b/llvm/test/Transforms/AggressiveInstCombine/memchr.ll new file mode 100644 index 00000000000000..2601b9f05a97f9 --- /dev/null +++ b/llvm/test/Transforms/AggressiveInstCombine/memchr.ll @@ -0,0 +1,163 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=aggressive-instcombine --memchr-inline-threshold=5 < %s | FileCheck %s + +@str = constant [5 x i8] c"01\002\00", align 1 +@str_long = constant [8 x i8] c"0123456\00", align 1 + +declare ptr @memchr(ptr, i32, i64) + +define i1 @test_memchr_null(i32 %x) { +; CHECK-LABEL: define i1 @test_memchr_null( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[X]] to i8 +; CHECK-NEXT: switch i8 [[TMP0]], label %[[ENTRY_SPLIT:.*]] [ +; CHECK-NEXT: i8 48, label %[[MEMCHR_CASE:.*]] +; CHECK-NEXT: i8 49, label %[[MEMCHR_CASE1:.*]] +; CHECK-NEXT: i8 0, label %[[MEMCHR_CASE2:.*]] +; CHECK-NEXT: i8 50, label %[[MEMCHR_CASE3:.*]] +; CHECK-NEXT: ] +; CHECK: [[MEMCHR_CASE]]: +; CHECK-NEXT: br label %[[MEMCHR_SUCCESS:.*]] +; CHECK: [[MEMCHR_CASE1]]: +; CHECK-NEXT: br label %[[MEMCHR_SUCCESS]] +; CHECK: [[MEMCHR_CASE2]]: +; CHECK-NEXT: br label %[[MEMCHR_SUCCESS]] +; CHECK: [[MEMCHR_CASE3]]: +; CHECK-NEXT: br label %[[MEMCHR_SUCCESS]] +; CHECK: [[MEMCHR_SUCCESS]]: +; CHECK-NEXT: [[MEMCHR_IDX:%.*]] = phi i64 [ 0, %[[MEMCHR_CASE]] ], [ 1, %[[MEMCHR_CASE1]] ], [ 2, %[[MEMCHR_CASE2]] ], [ 3, %[[MEMCHR_CASE3]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr @str, i64 [[MEMCHR_IDX]] +; CHECK-NEXT: br label %[[ENTRY_SPLIT]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: [[MEMCHR4:%.*]] = phi ptr [ null, %[[ENTRY]] ], [ [[TMP1]], %[[MEMCHR_SUCCESS]] ] +; CHECK-NEXT: [[ISNULL:%.*]] = icmp eq ptr [[MEMCHR4]], null +; CHECK-NEXT: ret i1 [[ISNULL]] +; +entry: + %memchr = call ptr @memchr(ptr @str, i32 %x, i64 5) + %isnull = icmp eq ptr %memchr, null + ret i1 %isnull +} + +define ptr @test_memchr(i32 %x) { +; CHECK-LABEL: define ptr @test_memchr( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[X]] to i8 +; CHECK-NEXT: switch i8 [[TMP0]], label %[[ENTRY_SPLIT:.*]] [ +; CHECK-NEXT: i8 48, label %[[MEMCHR_CASE:.*]] +; CHECK-NEXT: i8 49, label %[[MEMCHR_CASE1:.*]] +; CHECK-NEXT: i8 0, label %[[MEMCHR_CASE2:.*]] +; CHECK-NEXT: i8 50, label %[[MEMCHR_CASE3:.*]] +; CHECK-NEXT: ] +; CHECK: [[MEMCHR_CASE]]: +; CHECK-NEXT: br label %[[MEMCHR_SUCCESS:.*]] +; CHECK: [[MEMCHR_CASE1]]: +; CHECK-NEXT: br label %[[MEMCHR_SUCCESS]] +; CHECK: [[MEMCHR_CASE2]]: +; CHECK-NEXT: br label %[[MEMCHR_SUCCESS]] +; CHECK: [[MEMCHR_CASE3]]: +; CHECK-NEXT: br label %[[MEMCHR_SUCCESS]] +; CHECK: [[MEMCHR_SUCCESS]]: +; CHECK-NEXT: [[MEMCHR_IDX:%.*]] = phi i64 [ 0, %[[MEMCHR_CASE]] ], [ 1, %[[MEMCHR_CASE1]] ], [ 2, %[[MEMCHR_CASE2]] ], [ 3, %[[MEMCHR_CASE3]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr @str, i64 [[MEMCHR_IDX]] +; CHECK-NEXT: br label %[[ENTRY_SPLIT]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: [[MEMCHR4:%.*]] = phi ptr [ null, %[[ENTRY]] ], [ [[TMP1]], %[[MEMCHR_SUCCESS]] ] +; CHECK-NEXT: ret ptr [[MEMCHR4]] +; +entry: + %memchr = call ptr @memchr(ptr @str, i32 %x, i64 5) + ret ptr %memchr +} + +define ptr @test_memchr_smaller_n(i32 %x) { +; CHECK-LABEL: define ptr @test_memchr_smaller_n( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[X]] to i8 +; CHECK-NEXT: switch i8 [[TMP0]], label %[[ENTRY_SPLIT:.*]] [ +; CHECK-NEXT: i8 48, label %[[MEMCHR_CASE:.*]] +; CHECK-NEXT: i8 49, label %[[MEMCHR_CASE1:.*]] +; CHECK-NEXT: i8 0, label %[[MEMCHR_CASE2:.*]] +; CHECK-NEXT: ] +; CHECK: [[MEMCHR_CASE]]: +; CHECK-NEXT: br label %[[MEMCHR_SUCCESS:.*]] +; CHECK: [[MEMCHR_CASE1]]: +; CHECK-NEXT: br label %[[MEMCHR_SUCCESS]] +; CHECK: [[MEMCHR_CASE2]]: +; CHECK-NEXT: br label %[[MEMCHR_SUCCESS]] +; CHECK: [[MEMCHR_SUCCESS]]: +; CHECK-NEXT: [[MEMCHR_IDX:%.*]] = phi i64 [ 0, %[[MEMCHR_CASE]] ], [ 1, %[[MEMCHR_CASE1]] ], [ 2, %[[MEMCHR_CASE2]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr @str, i64 [[MEMCHR_IDX]] +; CHECK-NEXT: br label %[[ENTRY_SPLIT]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: [[MEMCHR3:%.*]] = phi ptr [ null, %[[ENTRY]] ], [ [[TMP1]], %[[MEMCHR_SUCCESS]] ] +; CHECK-NEXT: ret ptr [[MEMCHR3]] +; +entry: + %memchr = call ptr @memchr(ptr @str, i32 %x, i64 3) + ret ptr %memchr +} + +; negative tests + +define ptr @test_memchr_larger_n(i32 %x) { +; CHECK-LABEL: define ptr @test_memchr_larger_n( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MEMCHR:%.*]] = call ptr @memchr(ptr @str, i32 [[X]], i64 6) +; CHECK-NEXT: ret ptr [[MEMCHR]] +; +entry: + %memchr = call ptr @memchr(ptr @str, i32 %x, i64 6) + ret ptr %memchr +} + +define ptr @test_memchr_non_constant(i32 %x, ptr %str) { +; CHECK-LABEL: define ptr @test_memchr_non_constant( +; CHECK-SAME: i32 [[X:%.*]], ptr [[STR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MEMCHR:%.*]] = call ptr @memchr(ptr [[STR]], i32 [[X]], i64 5) +; CHECK-NEXT: ret ptr [[MEMCHR]] +; +entry: + %memchr = call ptr @memchr(ptr %str, i32 %x, i64 5) + ret ptr %memchr +} + +define ptr @test_memchr_constant_ch() { +; CHECK-LABEL: define ptr @test_memchr_constant_ch() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MEMCHR:%.*]] = call ptr @memchr(ptr @str, i32 49, i64 5) +; CHECK-NEXT: ret ptr [[MEMCHR]] +; +entry: + %memchr = call ptr @memchr(ptr @str, i32 49, i64 5) + ret ptr %memchr +} + +define ptr @test_memchr_dynamic_n(i32 %x, i32 %y) { +; CHECK-LABEL: define ptr @test_memchr_dynamic_n( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MEMCHR:%.*]] = call ptr @memchr(ptr @str, i32 [[X]], i32 [[Y]]) +; CHECK-NEXT: ret ptr [[MEMCHR]] +; +entry: + %memchr = call ptr @memchr(ptr @str, i32 %x, i32 %y) + ret ptr %memchr +} + +define ptr @test_memchr_long(i32 %x) { +; CHECK-LABEL: define ptr @test_memchr_long( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MEMCHR:%.*]] = call ptr @memchr(ptr @str_long, i32 [[X]], i64 8) +; CHECK-NEXT: ret ptr [[MEMCHR]] +; +entry: + %memchr = call ptr @memchr(ptr @str_long, i32 %x, i64 8) + ret ptr %memchr +} diff --git a/llvm/test/Transforms/EarlyCSE/math-2.ll b/llvm/test/Transforms/EarlyCSE/math-2.ll index d9f7c619fa0137..0d55165e3662fa 100644 --- a/llvm/test/Transforms/EarlyCSE/math-2.ll +++ b/llvm/test/Transforms/EarlyCSE/math-2.ll @@ -98,4 +98,22 @@ define double @i_powi() { ret double %res } +; Make sure that the type is correct after constant folding + +define half @pr98665() { +; CHECK-LABEL: @pr98665( +; CHECK-NEXT: ret half 0xH3C00 +; + %x = call half @llvm.powi.f16.i32(half 0xH3C00, i32 1) + ret half %x +} + +define float @powi_f32() { +; CHECK-LABEL: @powi_f32( +; CHECK-NEXT: ret float 0.000000e+00 +; + %y = call float @llvm.powi.f32.i32(float 0.0, i32 10) + ret float %y +} + attributes #0 = { nofree nounwind willreturn } diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/select-from-load.ll b/llvm/test/Transforms/InstCombine/AMDGPU/select-from-load.ll new file mode 100644 index 00000000000000..d9af665e663f3f --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AMDGPU/select-from-load.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=instcombine -S -o - %s | FileCheck %s +; REQUIRES: amdgpu-registered-target + +target triple = "amdgcn-amd-amdhsa" + +%anon = type { i32, [8 x ptr], ptr } + +define void @foo(ptr addrspace(4) byref(%anon) align 8 %0) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr addrspace(4) [[TMP0:%.*]], align 8 +; CHECK-NEXT: br label [[FOR_COND10:%.*]] +; CHECK: for.cond10: +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 +; CHECK-NEXT: store i64 [[TMP2]], ptr addrspace(1) null, align 8 +; CHECK-NEXT: br label [[FOR_COND10]] +; +entry: + %coerce = alloca %anon, addrspace(5) + call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %coerce, ptr addrspace(4) %0, i64 0, i1 false) + %asc = addrspacecast ptr addrspace(5) %coerce to ptr + %load = load ptr, ptr addrspace(5) %coerce + %retval.0.i = select i1 false, ptr %asc, ptr %load + br label %for.cond10 + +for.cond10: ; preds = %for.cond10, %entry + %3 = load i64, ptr %retval.0.i + store i64 %3, ptr addrspace(1) null + br label %for.cond10 +} + +declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5), ptr addrspace(4), i64, i1 immarg) diff --git a/llvm/test/Transforms/InstSimplify/and-or-implied-cond.ll b/llvm/test/Transforms/InstSimplify/and-or-implied-cond.ll index 16a5f8fbf13101..99e1dd45286974 100644 --- a/llvm/test/Transforms/InstSimplify/and-or-implied-cond.ll +++ b/llvm/test/Transforms/InstSimplify/and-or-implied-cond.ll @@ -331,4 +331,19 @@ define i1 @and_is_constant(ptr %arg, ptr %arg2) { ret i1 %and } +define i1 @pr98753(i32 noundef %x, i32 %y) { +; CHECK-LABEL: @pr98753( +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[X:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i32 [[Y:%.*]], i32 undef +; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[SEL]], 0 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]] +; CHECK-NEXT: ret i1 [[AND]] +; + %cmp1 = icmp ne i32 %x, 0 + %sel = select i1 %cmp1, i32 %y, i32 undef + %cmp2 = icmp sgt i32 %sel, 0 + %and = and i1 %cmp1, %cmp2 + ret i1 %and +} + declare i1 @llvm.is.constant.i1(i1) diff --git a/llvm/test/Transforms/InstSimplify/ptrtoint.ll b/llvm/test/Transforms/InstSimplify/ptrtoint.ll index 55a5a0d452f10a..734618713c342e 100644 --- a/llvm/test/Transforms/InstSimplify/ptrtoint.ll +++ b/llvm/test/Transforms/InstSimplify/ptrtoint.ll @@ -4,11 +4,7 @@ define i64 @ptrtoint_gep_sub(ptr %ptr, i64 %end.addr) { ; CHECK-LABEL: define i64 @ptrtoint_gep_sub( ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[END_ADDR:%.*]]) { -; CHECK-NEXT: [[PTR_ADDR:%.*]] = ptrtoint ptr [[PTR]] to i64 -; CHECK-NEXT: [[SIZE:%.*]] = sub i64 [[END_ADDR]], [[PTR_ADDR]] -; CHECK-NEXT: [[END:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[SIZE]] -; CHECK-NEXT: [[END_ADDR2:%.*]] = ptrtoint ptr [[END]] to i64 -; CHECK-NEXT: ret i64 [[END_ADDR2]] +; CHECK-NEXT: ret i64 [[END_ADDR]] ; %ptr.addr = ptrtoint ptr %ptr to i64 %size = sub i64 %end.addr, %ptr.addr @@ -20,11 +16,7 @@ define i64 @ptrtoint_gep_sub(ptr %ptr, i64 %end.addr) { define <2 x i64> @ptrtoint_gep_sub_vector(<2 x ptr> %ptr, <2 x i64> %end.addr) { ; CHECK-LABEL: define <2 x i64> @ptrtoint_gep_sub_vector( ; CHECK-SAME: <2 x ptr> [[PTR:%.*]], <2 x i64> [[END_ADDR:%.*]]) { -; CHECK-NEXT: [[PTR_ADDR:%.*]] = ptrtoint <2 x ptr> [[PTR]] to <2 x i64> -; CHECK-NEXT: [[SIZE:%.*]] = sub <2 x i64> [[END_ADDR]], [[PTR_ADDR]] -; CHECK-NEXT: [[END:%.*]] = getelementptr i8, <2 x ptr> [[PTR]], <2 x i64> [[SIZE]] -; CHECK-NEXT: [[END_ADDR2:%.*]] = ptrtoint <2 x ptr> [[END]] to <2 x i64> -; CHECK-NEXT: ret <2 x i64> [[END_ADDR2]] +; CHECK-NEXT: ret <2 x i64> [[END_ADDR]] ; %ptr.addr = ptrtoint <2 x ptr> %ptr to <2 x i64> %size = sub <2 x i64> %end.addr, %ptr.addr diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/ctlz.ll b/llvm/test/Transforms/LoopIdiom/AArch64/ctlz.ll new file mode 100644 index 00000000000000..5a182745399bee --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/AArch64/ctlz.ll @@ -0,0 +1,809 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=loop-idiom -mtriple=aarch64 < %s -S | FileCheck %s + +; Recognize CTLZ builtin pattern. +; Here we'll just convert loop to countable, +; so do not insert builtin if CPU do not support CTLZ +; +; int ctlz_and_other(int n, char *a) +; { +; n = n >= 0 ? n : -n; +; int i = 0, n0 = n; +; while(n >>= 1) { +; a[i] = (n0 & (1 << i)) ? 1 : 0; +; i++; +; } +; return i; +; } +; + +; Function Attrs: norecurse nounwind uwtable +define i32 @ctlz_and_other(i32 %n, ptr nocapture %a) { +; CHECK-LABEL: define i32 @ctlz_and_other( +; CHECK-SAME: i32 [[N:%.*]], ptr nocapture [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ABS_N:%.*]] = call i32 @llvm.abs.i32(i32 [[N]], i1 true) +; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[ABS_N]], 1 +; CHECK-NEXT: [[TOBOOL9:%.*]] = icmp eq i32 [[SHR8]], 0 +; CHECK-NEXT: br i1 [[TOBOOL9]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] +; CHECK: while.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[SHR8]], i1 true) +; CHECK-NEXT: [[TMP1:%.*]] = sub i32 32, [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: br label [[WHILE_BODY:%.*]] +; CHECK: while.body: +; CHECK-NEXT: [[TCPHI:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TCDEC:%.*]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[SHR11:%.*]] = phi i32 [ [[SHR:%.*]], [[WHILE_BODY]] ], [ [[SHR8]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 1, [[TMP3]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[SHL]], [[ABS_N]] +; CHECK-NEXT: [[TOBOOL1:%.*]] = icmp ne i32 [[AND]], 0 +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL1]] to i8 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i8 [[CONV]], ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[SHR]] = ashr i32 [[SHR11]], 1 +; CHECK-NEXT: [[TCDEC]] = sub nsw i32 [[TCPHI]], 1 +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0 +; CHECK-NEXT: br i1 [[TOBOOL]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]] +; CHECK: while.end.loopexit: +; CHECK-NEXT: [[INDVARS_IV_NEXT_LCSSA:%.*]] = phi i64 [ [[TMP2]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV_NEXT_LCSSA]] to i32 +; CHECK-NEXT: br label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[I_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[WHILE_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[I_0_LCSSA]] +; +entry: + %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true) + %shr8 = lshr i32 %abs_n, 1 + %tobool9 = icmp eq i32 %shr8, 0 + br i1 %tobool9, label %while.end, label %while.body.preheader + +while.body.preheader: ; preds = %entry + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %indvars.iv = phi i64 [ %indvars.iv.next, %while.body ], [ 0, %while.body.preheader ] + %shr11 = phi i32 [ %shr, %while.body ], [ %shr8, %while.body.preheader ] + %0 = trunc i64 %indvars.iv to i32 + %shl = shl i32 1, %0 + %and = and i32 %shl, %abs_n + %tobool1 = icmp ne i32 %and, 0 + %conv = zext i1 %tobool1 to i8 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv + store i8 %conv, ptr %arrayidx, align 1 + %indvars.iv.next = add nuw i64 %indvars.iv, 1 + %shr = ashr i32 %shr11, 1 + %tobool = icmp eq i32 %shr, 0 + br i1 %tobool, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + %1 = trunc i64 %indvars.iv.next to i32 + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %1, %while.end.loopexit ] + ret i32 %i.0.lcssa +} + +; Recognize CTLZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. +; +; int ctlz_zero_check(int n) +; { +; n = n >= 0 ? n : -n; +; int i = 0; +; while(n) { +; n >>= 1; +; i++; +; } +; return i; +; } +; + +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @ctlz_zero_check(i32 %n) { +; CHECK-LABEL: define i32 @ctlz_zero_check( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ABS_N:%.*]] = call i32 @llvm.abs.i32(i32 [[N]], i1 true) +; CHECK-NEXT: [[TOBOOL4:%.*]] = icmp eq i32 [[ABS_N]], 0 +; CHECK-NEXT: br i1 [[TOBOOL4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] +; CHECK: while.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[ABS_N]], i1 true) +; CHECK-NEXT: [[TMP1:%.*]] = sub i32 32, [[TMP0]] +; CHECK-NEXT: br label [[WHILE_BODY:%.*]] +; CHECK: while.body: +; CHECK-NEXT: [[TCPHI:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TCDEC:%.*]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[I_06:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[N_ADDR_05:%.*]] = phi i32 [ [[SHR:%.*]], [[WHILE_BODY]] ], [ [[ABS_N]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[SHR]] = ashr i32 [[N_ADDR_05]], 1 +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_06]], 1 +; CHECK-NEXT: [[TCDEC]] = sub nsw i32 [[TCPHI]], 1 +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0 +; CHECK-NEXT: br i1 [[TOBOOL]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]] +; CHECK: while.end.loopexit: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY]] ] +; CHECK-NEXT: br label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[I_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_LCSSA]], [[WHILE_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[I_0_LCSSA]] +; +entry: + %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true) + %tobool4 = icmp eq i32 %abs_n, 0 + br i1 %tobool4, label %while.end, label %while.body.preheader + +while.body.preheader: ; preds = %entry + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %n.addr.05 = phi i32 [ %shr, %while.body ], [ %abs_n, %while.body.preheader ] + %shr = ashr i32 %n.addr.05, 1 + %inc = add nsw i32 %i.06, 1 + %tobool = icmp eq i32 %shr, 0 + br i1 %tobool, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ] + ret i32 %i.0.lcssa +} + +; Recognize CTLZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. +; +; int ctlz(int n) +; { +; n = n >= 0 ? n : -n; +; int i = 0; +; while(n >>= 1) { +; i++; +; } +; return i; +; } +; + +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @ctlz(i32 %n) { +; CHECK-LABEL: define i32 @ctlz( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ABS_N:%.*]] = call i32 @llvm.abs.i32(i32 [[N]], i1 true) +; CHECK-NEXT: [[TMP0:%.*]] = ashr i32 [[ABS_N]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 false) +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 32, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[TCPHI:%.*]] = phi i32 [ [[TMP3]], [[ENTRY:%.*]] ], [ [[TCDEC:%.*]], [[WHILE_COND]] ] +; CHECK-NEXT: [[N_ADDR_0:%.*]] = phi i32 [ [[ABS_N]], [[ENTRY]] ], [ [[SHR:%.*]], [[WHILE_COND]] ] +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[WHILE_COND]] ] +; CHECK-NEXT: [[SHR]] = ashr i32 [[N_ADDR_0]], 1 +; CHECK-NEXT: [[TCDEC]] = sub nsw i32 [[TCPHI]], 1 +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0 +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br i1 [[TOBOOL]], label [[WHILE_END:%.*]], label [[WHILE_COND]] +; CHECK: while.end: +; CHECK-NEXT: [[I_0_LCSSA:%.*]] = phi i32 [ [[TMP2]], [[WHILE_COND]] ] +; CHECK-NEXT: ret i32 [[I_0_LCSSA]] +; +entry: + %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true) + br label %while.cond + +while.cond: ; preds = %while.cond, %entry + %n.addr.0 = phi i32 [ %abs_n, %entry ], [ %shr, %while.cond ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ] + %shr = ashr i32 %n.addr.0, 1 + %tobool = icmp eq i32 %shr, 0 + %inc = add nsw i32 %i.0, 1 + br i1 %tobool, label %while.end, label %while.cond + +while.end: ; preds = %while.cond + ret i32 %i.0 +} + +; Recognize CTLZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. +; +; This test covers how instcombine may optimise the previous ctlz case. +; +; int ctlz(int n) +; { +; n = n >= 0 ? n : -n; +; int i = 0; +; while(n >>= 1) { +; i++; +; } +; return i; +; } + +define i32 @ctlz_fold(i32 noundef %n) { +; CHECK-LABEL: define i32 @ctlz_fold( +; CHECK-SAME: i32 noundef [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND:%.*]] = tail call i32 @llvm.abs.i32(i32 [[N]], i1 true) +; CHECK-NEXT: [[TOBOOL_NOT5:%.*]] = icmp ult i32 [[COND]], 2 +; CHECK-NEXT: br i1 [[TOBOOL_NOT5]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] +; CHECK: while.body.preheader: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[COND]], i1 true) +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 32, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[TMP2]], 1 +; CHECK-NEXT: br label [[WHILE_BODY:%.*]] +; CHECK: while.body: +; CHECK-NEXT: [[TCPHI:%.*]] = phi i32 [ [[TMP3]], [[WHILE_BODY_PREHEADER]] ], [ [[TCDEC:%.*]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[N_ADDR_06:%.*]] = phi i32 [ [[SHR:%.*]], [[WHILE_BODY]] ], [ [[COND]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[SHR]] = lshr i32 [[N_ADDR_06]], 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1 +; CHECK-NEXT: [[TCDEC]] = sub nsw i32 [[TCPHI]], 1 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TCDEC]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]] +; CHECK: while.end.loopexit: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[TMP3]], [[WHILE_BODY]] ] +; CHECK-NEXT: br label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[I_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_LCSSA]], [[WHILE_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[I_0_LCSSA]] +; +entry: + %cond = tail call i32 @llvm.abs.i32(i32 %n, i1 true) + %tobool.not5 = icmp ult i32 %cond, 2 + br i1 %tobool.not5, label %while.end, label %while.body.preheader + +while.body.preheader: ; preds = %entry + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.07 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %n.addr.06 = phi i32 [ %shr, %while.body ], [ %cond, %while.body.preheader ] + %shr = lshr i32 %n.addr.06, 1 + %inc = add nuw nsw i32 %i.07, 1 + %tobool.not = icmp ult i32 %n.addr.06, 4 + br i1 %tobool.not, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + %inc.lcssa = phi i32 [ %inc, %while.body ] + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ] + ret i32 %i.0.lcssa +} + +; Recognize CTLZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. +; +; int ctlz_add(int n, int i0) +; { +; n = n >= 0 ? n : -n; +; int i = i0; +; while(n >>= 1) { +; i++; +; } +; return i; +; } +; +; +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @ctlz_add(i32 %n, i32 %i0) { +; CHECK-LABEL: define i32 @ctlz_add( +; CHECK-SAME: i32 [[N:%.*]], i32 [[I0:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ABS_N:%.*]] = call i32 @llvm.abs.i32(i32 [[N]], i1 true) +; CHECK-NEXT: [[TMP0:%.*]] = ashr i32 [[ABS_N]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 false) +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 32, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], [[I0]] +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[TCPHI:%.*]] = phi i32 [ [[TMP3]], [[ENTRY:%.*]] ], [ [[TCDEC:%.*]], [[WHILE_COND]] ] +; CHECK-NEXT: [[N_ADDR_0:%.*]] = phi i32 [ [[ABS_N]], [[ENTRY]] ], [ [[SHR:%.*]], [[WHILE_COND]] ] +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ [[I0]], [[ENTRY]] ], [ [[INC:%.*]], [[WHILE_COND]] ] +; CHECK-NEXT: [[SHR]] = ashr i32 [[N_ADDR_0]], 1 +; CHECK-NEXT: [[TCDEC]] = sub nsw i32 [[TCPHI]], 1 +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0 +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br i1 [[TOBOOL]], label [[WHILE_END:%.*]], label [[WHILE_COND]] +; CHECK: while.end: +; CHECK-NEXT: [[I_0_LCSSA:%.*]] = phi i32 [ [[TMP4]], [[WHILE_COND]] ] +; CHECK-NEXT: ret i32 [[I_0_LCSSA]] +; +entry: + %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true) + br label %while.cond + +while.cond: ; preds = %while.cond, %entry + %n.addr.0 = phi i32 [ %abs_n, %entry ], [ %shr, %while.cond ] + %i.0 = phi i32 [ %i0, %entry ], [ %inc, %while.cond ] + %shr = ashr i32 %n.addr.0, 1 + %tobool = icmp eq i32 %shr, 0 + %inc = add nsw i32 %i.0, 1 + br i1 %tobool, label %while.end, label %while.cond + +while.end: ; preds = %while.cond + ret i32 %i.0 +} + + +; Recognize CTLZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. +; +; int ctlz_sub(int n, int i0) +; { +; n = n >= 0 ? n : -n; +; int i = i0; +; while(n >>= 1) { +; i--; +; } +; return i; +; } +; +; +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @ctlz_sub(i32 %n, i32 %i0) { +; CHECK-LABEL: define i32 @ctlz_sub( +; CHECK-SAME: i32 [[N:%.*]], i32 [[I0:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ABS_N:%.*]] = call i32 @llvm.abs.i32(i32 [[N]], i1 true) +; CHECK-NEXT: [[TMP0:%.*]] = ashr i32 [[ABS_N]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 false) +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 32, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[I0]], [[TMP2]] +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[TCPHI:%.*]] = phi i32 [ [[TMP3]], [[ENTRY:%.*]] ], [ [[TCDEC:%.*]], [[WHILE_COND]] ] +; CHECK-NEXT: [[N_ADDR_0:%.*]] = phi i32 [ [[ABS_N]], [[ENTRY]] ], [ [[SHR:%.*]], [[WHILE_COND]] ] +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ [[I0]], [[ENTRY]] ], [ [[INC:%.*]], [[WHILE_COND]] ] +; CHECK-NEXT: [[SHR]] = ashr i32 [[N_ADDR_0]], 1 +; CHECK-NEXT: [[TCDEC]] = sub nsw i32 [[TCPHI]], 1 +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0 +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], -1 +; CHECK-NEXT: br i1 [[TOBOOL]], label [[WHILE_END:%.*]], label [[WHILE_COND]] +; CHECK: while.end: +; CHECK-NEXT: [[I_0_LCSSA:%.*]] = phi i32 [ [[TMP4]], [[WHILE_COND]] ] +; CHECK-NEXT: ret i32 [[I_0_LCSSA]] +; +entry: + %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true) + br label %while.cond + +while.cond: ; preds = %while.cond, %entry + %n.addr.0 = phi i32 [ %abs_n, %entry ], [ %shr, %while.cond ] + %i.0 = phi i32 [ %i0, %entry ], [ %inc, %while.cond ] + %shr = ashr i32 %n.addr.0, 1 + %tobool = icmp eq i32 %shr, 0 + %inc = add nsw i32 %i.0, -1 + br i1 %tobool, label %while.end, label %while.cond + +while.end: ; preds = %while.cond + ret i32 %i.0 +} + + +; Recognize CTLZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. +; +; int ctlz_sext(short in) +; { +; int n = in; +; if (in < 0) +; n = -n; +; int i = 0; +; while(n >>= 1) { +; i++; +; } +; return i; +; } +; + +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @ctlz_sext(i16 %in) { +; CHECK-LABEL: define i32 @ctlz_sext( +; CHECK-SAME: i16 [[IN:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ABS:%.*]] = call i16 @llvm.abs.i16(i16 [[IN]], i1 false) +; CHECK-NEXT: [[ABS_N:%.*]] = zext i16 [[ABS]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = ashr i32 [[ABS_N]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 false) +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 32, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[TCPHI:%.*]] = phi i32 [ [[TMP3]], [[ENTRY:%.*]] ], [ [[TCDEC:%.*]], [[WHILE_COND]] ] +; CHECK-NEXT: [[N_ADDR_0:%.*]] = phi i32 [ [[ABS_N]], [[ENTRY]] ], [ [[SHR:%.*]], [[WHILE_COND]] ] +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[WHILE_COND]] ] +; CHECK-NEXT: [[SHR]] = ashr i32 [[N_ADDR_0]], 1 +; CHECK-NEXT: [[TCDEC]] = sub nsw i32 [[TCPHI]], 1 +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0 +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br i1 [[TOBOOL]], label [[WHILE_END:%.*]], label [[WHILE_COND]] +; CHECK: while.end: +; CHECK-NEXT: [[I_0_LCSSA:%.*]] = phi i32 [ [[TMP2]], [[WHILE_COND]] ] +; CHECK-NEXT: ret i32 [[I_0_LCSSA]] +; +entry: + %abs = call i16 @llvm.abs.i16(i16 %in, i1 false) + %abs_n = zext i16 %abs to i32 + br label %while.cond + +while.cond: ; preds = %while.cond, %entry + %n.addr.0 = phi i32 [ %abs_n, %entry ], [ %shr, %while.cond ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ] + %shr = ashr i32 %n.addr.0, 1 + %tobool = icmp eq i32 %shr, 0 + %inc = add nsw i32 %i.0, 1 + br i1 %tobool, label %while.end, label %while.cond + +while.end: ; preds = %while.cond + ret i32 %i.0 +} + + +; unsigned floor_log2(unsigned long n) { +; unsigned result = 0; +; while (n >>= 1) result++; +; return result; +; } + +define i32 @floor_log2_use_inc(i64 noundef %n) { +; CHECK-LABEL: define i32 @floor_log2_use_inc( +; CHECK-SAME: i64 noundef [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TOBOOL_NOT2:%.*]] = icmp ult i64 [[N]], 2 +; CHECK-NEXT: br i1 [[TOBOOL_NOT2]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] +; CHECK: while.body.preheader: +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[N]], i1 true) +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 64, [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP4]] to i32 +; CHECK-NEXT: br label [[WHILE_BODY:%.*]] +; CHECK: while.body: +; CHECK-NEXT: [[TCPHI:%.*]] = phi i64 [ [[TMP4]], [[WHILE_BODY_PREHEADER]] ], [ [[TCDEC:%.*]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[RESULT_04:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[N_ADDR_03:%.*]] = phi i64 [ [[SHR:%.*]], [[WHILE_BODY]] ], [ [[N]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[SHR]] = lshr i64 [[N_ADDR_03]], 1 +; CHECK-NEXT: [[INC]] = add i32 [[RESULT_04]], 1 +; CHECK-NEXT: [[TCDEC]] = sub nsw i64 [[TCPHI]], 1 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i64 [[TCDEC]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]] +; CHECK: while.end.loopexit: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[TMP3]], [[WHILE_BODY]] ] +; CHECK-NEXT: br label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_LCSSA]], [[WHILE_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] +; +entry: + %tobool.not2 = icmp ult i64 %n, 2 + br i1 %tobool.not2, label %while.end, label %while.body.preheader + +while.body.preheader: + br label %while.body + +while.body: + %result.04 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %n.addr.03 = phi i64 [ %shr, %while.body ], [ %n, %while.body.preheader ] + %shr = lshr i64 %n.addr.03, 1 + %inc = add i32 %result.04, 1 + %tobool.not = icmp ult i64 %n.addr.03, 4 + br i1 %tobool.not, label %while.end.loopexit, label %while.body + +while.end.loopexit: + %inc.lcssa = phi i32 [ %inc, %while.body ] + br label %while.end + +while.end: + %result.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ] + ret i32 %result.0.lcssa +} + + +define i32 @floor_log2_use_phi(i64 noundef %n) { +; CHECK-LABEL: define i32 @floor_log2_use_phi( +; CHECK-SAME: i64 noundef [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TOBOOL_NOT2:%.*]] = icmp ult i64 [[N]], 2 +; CHECK-NEXT: br i1 [[TOBOOL_NOT2]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] +; CHECK: while.body.preheader: +; CHECK-NEXT: br label [[WHILE_BODY:%.*]] +; CHECK: while.body: +; CHECK-NEXT: [[RESULT_04:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[N_ADDR_03:%.*]] = phi i64 [ [[SHR:%.*]], [[WHILE_BODY]] ], [ [[N]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[SHR]] = lshr i64 [[N_ADDR_03]], 1 +; CHECK-NEXT: [[INC]] = add i32 [[RESULT_04]], 1 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp ult i64 [[N_ADDR_03]], 4 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]] +; CHECK: while.end.loopexit: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[RESULT_04]], [[WHILE_BODY]] ] +; CHECK-NEXT: br label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_LCSSA]], [[WHILE_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] +; +entry: + %tobool.not2 = icmp ult i64 %n, 2 + br i1 %tobool.not2, label %while.end, label %while.body.preheader + +while.body.preheader: + br label %while.body + +while.body: + %result.04 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %n.addr.03 = phi i64 [ %shr, %while.body ], [ %n, %while.body.preheader ] + %shr = lshr i64 %n.addr.03, 1 + %inc = add i32 %result.04, 1 + %tobool.not = icmp ult i64 %n.addr.03, 4 + br i1 %tobool.not, label %while.end.loopexit, label %while.body + +while.end.loopexit: + %inc.lcssa = phi i32 [ %result.04, %while.body ] + br label %while.end + +while.end: + %result.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ] + ret i32 %result.0.lcssa +} + + +; unsigned floor_log2_dec(unsigned long n) { +; unsigned result = 0; +; while (n >>= 1) result--; +; return result; +; } + +define i32 @floor_log2_dec(i64 noundef %n) { +; CHECK-LABEL: define i32 @floor_log2_dec( +; CHECK-SAME: i64 noundef [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TOBOOL_NOT2:%.*]] = icmp ult i64 [[N]], 2 +; CHECK-NEXT: br i1 [[TOBOOL_NOT2]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] +; CHECK: while.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[N]], i1 true) +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 64, [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] +; CHECK-NEXT: br label [[WHILE_BODY:%.*]] +; CHECK: while.body: +; CHECK-NEXT: [[TCPHI:%.*]] = phi i64 [ [[TMP2]], [[WHILE_BODY_PREHEADER]] ], [ [[TCDEC:%.*]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[RESULT_04:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[N_ADDR_03:%.*]] = phi i64 [ [[SHR:%.*]], [[WHILE_BODY]] ], [ [[N]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[SHR]] = lshr i64 [[N_ADDR_03]], 1 +; CHECK-NEXT: [[INC]] = add i32 [[RESULT_04]], -1 +; CHECK-NEXT: [[TCDEC]] = sub nsw i64 [[TCPHI]], 1 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i64 [[TCDEC]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]] +; CHECK: while.end.loopexit: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[TMP4]], [[WHILE_BODY]] ] +; CHECK-NEXT: br label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_LCSSA]], [[WHILE_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] +; +entry: + %tobool.not2 = icmp ult i64 %n, 2 + br i1 %tobool.not2, label %while.end, label %while.body.preheader + +while.body.preheader: + br label %while.body + +while.body: + %result.04 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %n.addr.03 = phi i64 [ %shr, %while.body ], [ %n, %while.body.preheader ] + %shr = lshr i64 %n.addr.03, 1 + %inc = add i32 %result.04, -1 + %tobool.not = icmp ult i64 %n.addr.03, 4 + br i1 %tobool.not, label %while.end.loopexit, label %while.body + +while.end.loopexit: + %inc.lcssa = phi i32 [ %inc, %while.body ] + br label %while.end + +while.end: + %result.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ] + ret i32 %result.0.lcssa +} + + +; unsigned int_log2_rec(unsigned x) { +; return x == 0 ? 0 : int_log2_rec(x >> 1) + 1; +; } + +define i32 @int_log2_rec(i32 noundef %x) { +; CHECK-LABEL: define i32 @int_log2_rec( +; CHECK-SAME: i32 noundef [[X:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[X]], 0 +; CHECK-NEXT: br i1 [[CMP2]], label [[COND_END:%.*]], label [[COND_FALSE_PREHEADER:%.*]] +; CHECK: cond.false.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[X]], i1 true) +; CHECK-NEXT: [[TMP1:%.*]] = sub i32 32, [[TMP0]] +; CHECK-NEXT: br label [[COND_FALSE:%.*]] +; CHECK: cond.false: +; CHECK-NEXT: [[TCPHI:%.*]] = phi i32 [ [[TMP1]], [[COND_FALSE_PREHEADER]] ], [ [[TCDEC:%.*]], [[COND_FALSE]] ] +; CHECK-NEXT: [[X_TR4:%.*]] = phi i32 [ [[SHR:%.*]], [[COND_FALSE]] ], [ [[X]], [[COND_FALSE_PREHEADER]] ] +; CHECK-NEXT: [[ACCUMULATOR_TR3:%.*]] = phi i32 [ [[ADD:%.*]], [[COND_FALSE]] ], [ 0, [[COND_FALSE_PREHEADER]] ] +; CHECK-NEXT: [[SHR]] = lshr i32 [[X_TR4]], 1 +; CHECK-NEXT: [[ADD]] = add i32 [[ACCUMULATOR_TR3]], 1 +; CHECK-NEXT: [[TCDEC]] = sub nsw i32 [[TCPHI]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TCDEC]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_END_LOOPEXIT:%.*]], label [[COND_FALSE]] +; CHECK: cond.end.loopexit: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[TMP1]], [[COND_FALSE]] ] +; CHECK-NEXT: br label [[COND_END]] +; CHECK: cond.end: +; CHECK-NEXT: [[ACCUMULATOR_TR_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[COND_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[ACCUMULATOR_TR_LCSSA]] +; +entry: + %cmp2 = icmp eq i32 %x, 0 + br i1 %cmp2, label %cond.end, label %cond.false.preheader + +cond.false.preheader: ; preds = %entry + br label %cond.false + +cond.false: ; preds = %cond.false.preheader, %cond.false + %x.tr4 = phi i32 [ %shr, %cond.false ], [ %x, %cond.false.preheader ] + %accumulator.tr3 = phi i32 [ %add, %cond.false ], [ 0, %cond.false.preheader ] + %shr = lshr i32 %x.tr4, 1 + %add = add i32 %accumulator.tr3, 1 + %cmp = icmp ult i32 %x.tr4, 2 + br i1 %cmp, label %cond.end.loopexit, label %cond.false + +cond.end.loopexit: ; preds = %cond.false + %add.lcssa = phi i32 [ %add, %cond.false ] + br label %cond.end + +cond.end: ; preds = %cond.end.loopexit, %entry + %accumulator.tr.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %cond.end.loopexit ] + ret i32 %accumulator.tr.lcssa +} + + +; We can't easily transform this loop. It returns 1 for an input of both +; 0 and 1. +; int ctlz_do_while_use_inc(unsigned n) +; { +; int i = 0; +; do { +; i++; +; n >>= 1; +; } while(n != 0); +; return i; +; } + +define i32 @ctlz_do_while_use_inc(i32 noundef %n) { +; CHECK-LABEL: define i32 @ctlz_do_while_use_inc( +; CHECK-SAME: i32 noundef [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[DO_BODY:%.*]] +; CHECK: do.body: +; CHECK-NEXT: [[N_ADDR_0:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[SHR:%.*]], [[DO_BODY]] ] +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[DO_BODY]] ] +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_0]], 1 +; CHECK-NEXT: [[SHR]] = lshr i32 [[N_ADDR_0]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp ult i32 [[N_ADDR_0]], 2 +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[DO_END:%.*]], label [[DO_BODY]] +; CHECK: do.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[DO_BODY]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + br label %do.body + +do.body: ; preds = %do.body, %entry + %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %do.body ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %do.body ] + %inc = add nuw nsw i32 %i.0, 1 + %shr = lshr i32 %n.addr.0, 1 + %cmp.not = icmp ult i32 %n.addr.0, 2 + br i1 %cmp.not, label %do.end, label %do.body + +do.end: ; preds = %do.body + %inc.lcssa = phi i32 [ %inc, %do.body ] + ret i32 %inc.lcssa +} + + +; Recognize CTLZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. +; +; int ctlz_do_while_use_phi(unsigned n) +; { +; int phi; +; int inc = 0; +; do { +; phi = inc; +; inc++; +; n >>= 1; +; } while(n != 0); +; return phi; +; } + +define i32 @ctlz_do_while_use_phi(i32 noundef %n) { +; CHECK-LABEL: define i32 @ctlz_do_while_use_phi( +; CHECK-SAME: i32 noundef [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 false) +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 32, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 +; CHECK-NEXT: br label [[DO_BODY:%.*]] +; CHECK: do.body: +; CHECK-NEXT: [[TCPHI:%.*]] = phi i32 [ [[TMP3]], [[ENTRY:%.*]] ], [ [[TCDEC:%.*]], [[DO_BODY]] ] +; CHECK-NEXT: [[N_ADDR_0:%.*]] = phi i32 [ [[N]], [[ENTRY]] ], [ [[SHR:%.*]], [[DO_BODY]] ] +; CHECK-NEXT: [[INC_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC1:%.*]], [[DO_BODY]] ] +; CHECK-NEXT: [[INC1]] = add nuw nsw i32 [[INC_0]], 1 +; CHECK-NEXT: [[SHR]] = lshr i32 [[N_ADDR_0]], 1 +; CHECK-NEXT: [[TCDEC]] = sub nsw i32 [[TCPHI]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[TCDEC]], 0 +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[DO_END:%.*]], label [[DO_BODY]] +; CHECK: do.end: +; CHECK-NEXT: [[INC_0_LCSSA:%.*]] = phi i32 [ [[TMP2]], [[DO_BODY]] ] +; CHECK-NEXT: ret i32 [[INC_0_LCSSA]] +; +entry: + br label %do.body + +do.body: ; preds = %do.body, %entry + %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %do.body ] + %inc.0 = phi i32 [ 0, %entry ], [ %inc1, %do.body ] + %inc1 = add nuw nsw i32 %inc.0, 1 + %shr = lshr i32 %n.addr.0, 1 + %cmp.not = icmp ult i32 %n.addr.0, 2 + br i1 %cmp.not, label %do.end, label %do.body + +do.end: ; preds = %do.body + ret i32 %inc.0 +} + +; Check that we correctly bail on analysis when the ult comparison is with a +; constant that exceeds the (unsigned) range of a 64-bit integer, as we currently +; only handle loopback condition ult 2 or 4. + +define i128 @large_constant(i128 noundef %n) { +; CHECK-LABEL: define i128 @large_constant( +; CHECK-SAME: i128 noundef [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[DO_BODY:%.*]] +; CHECK: do.body: +; CHECK-NEXT: [[N_ADDR_0:%.*]] = phi i128 [ [[N]], [[ENTRY:%.*]] ], [ [[SHR:%.*]], [[DO_BODY]] ] +; CHECK-NEXT: [[SHR]] = lshr i128 [[N_ADDR_0]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp ult i128 [[N_ADDR_0]], 18446744073709551616 +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[DO_END:%.*]], label [[DO_BODY]] +; CHECK: do.end: +; CHECK-NEXT: [[SHR_LCSSA:%.*]] = phi i128 [ [[SHR]], [[DO_BODY]] ] +; CHECK-NEXT: ret i128 [[SHR_LCSSA]] +; +entry: + br label %do.body + +do.body: ; preds = %do.body, %entry + %n.addr.0 = phi i128 [ %n, %entry ], [ %shr, %do.body ] + %shr = lshr i128 %n.addr.0, 1 + %cmp.not = icmp ult i128 %n.addr.0, 18446744073709551616 + br i1 %cmp.not, label %do.end, label %do.body + +do.end: ; preds = %do.body + ret i128 %shr +} + + +declare i32 @llvm.abs.i32(i32, i1) +declare i16 @llvm.abs.i16(i16, i1) diff --git a/llvm/test/Transforms/LoopRotate/minsize-disable.ll b/llvm/test/Transforms/LoopRotate/minsize-disable.ll new file mode 100644 index 00000000000000..2db87b3ce8291b --- /dev/null +++ b/llvm/test/Transforms/LoopRotate/minsize-disable.ll @@ -0,0 +1,32 @@ +; REQUIRES: asserts +; RUN: opt < %s -S -passes=loop-rotate -debug -debug-only=loop-rotate 2>&1 | FileCheck %s + +; Loop should not be rotated for functions with the minsize attribute. +; This is mostly useful for LTO which doesn't (yet) understand -Oz. +; CHECK: LoopRotation: NOT rotating - contains 2 instructions, which is more + +@e = global i32 10 + +declare void @use(i32) + +; Function attrs: minsize optsize +define void @test() #0 { +entry: + %end = load i32, ptr @e + br label %loop + +loop: + %n.phi = phi i32 [ %n, %loop.fin ], [ 0, %entry ] + %cond = icmp eq i32 %n.phi, %end + br i1 %cond, label %exit, label %loop.fin + +loop.fin: + %n = add i32 %n.phi, 1 + call void @use(i32 %n) + br label %loop + +exit: + ret void +} + +attributes #0 = { minsize optsize } diff --git a/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll b/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll index 1ab19081b7de59..7353acd7228cdc 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll @@ -20,7 +20,6 @@ define ptr @foo(ptr %a0, ptr %a1, i64 %a2) { ; CHECK-NEXT: mv a3, a0 ; CHECK-NEXT: .LBB0_3: # %do.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vsetvli zero, a4, e8, m8, ta, ma ; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vse8.v v8, (a3) ; CHECK-NEXT: add a3, a3, a4 diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/preserving-debugloc-phi-binop.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/preserving-debugloc-phi-binop.ll new file mode 100644 index 00000000000000..1aecd34082a2a5 --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/X86/preserving-debugloc-phi-binop.ll @@ -0,0 +1,59 @@ +; RUN: opt -S -passes=loop-reduce -mtriple=x86_64-unknown-unknown < %s | FileCheck %s + +; Check that LoopStrengthReduce's OptimizeShadowIV() propagates the debug +; locations of the old phi (`%accum`) and binop (`%accum.next`) instruction +; to the new phi and binop instruction, respectively. + +target datalayout = "n8:16:32:64" + +define i32 @foobar6() !dbg !5 { +; CHECK-LABEL: define i32 @foobar6( +; CHECK: loop: +; CHECK: [[IV_S_:%.*]] = phi double [ -3.220000e+03, %[[ENTRY:.*]] ], [ [[IV_S_NEXT_:%.*]], %loop ], !dbg [[DBG9:![0-9]+]] +; CHECK: [[IV_S_NEXT_]] = fadd double [[IV_S_]], 0x41624E65A0000000, !dbg [[DBG11:![0-9]+]] +; CHECK: exit: +; +entry: + br label %loop, !dbg !8 + +loop: ; preds = %loop, %entry + %accum = phi i32 [ -3220, %entry ], [ %accum.next, %loop ], !dbg !9 + %iv = phi i32 [ 12, %entry ], [ %iv.next, %loop ], !dbg !10 + %tmp1 = sitofp i32 %accum to double, !dbg !11 + tail call void @foo(double %tmp1), !dbg !12 + %accum.next = add nsw i32 %accum, 9597741, !dbg !13 + %iv.next = add nuw nsw i32 %iv, 1, !dbg !14 + %exitcond = icmp ugt i32 %iv, 235, !dbg !15 + br i1 %exitcond, label %exit, label %loop, !dbg !16 + +exit: ; preds = %loop + ret i32 %accum.next, !dbg !17 +} + +declare void @foo(double) + +!llvm.dbg.cu = !{!0} +!llvm.debugify = !{!2, !3} +!llvm.module.flags = !{!4} + +; CHECK: [[DBG9]] = !DILocation(line: 2, +; CHECK: [[DBG11]] = !DILocation(line: 6, + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "main.ll", directory: "/") +!2 = !{i32 10} +!3 = !{i32 0} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "foobar6", linkageName: "foobar6", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !DILocation(line: 1, column: 1, scope: !5) +!9 = !DILocation(line: 2, column: 1, scope: !5) +!10 = !DILocation(line: 3, column: 1, scope: !5) +!11 = !DILocation(line: 4, column: 1, scope: !5) +!12 = !DILocation(line: 5, column: 1, scope: !5) +!13 = !DILocation(line: 6, column: 1, scope: !5) +!14 = !DILocation(line: 7, column: 1, scope: !5) +!15 = !DILocation(line: 8, column: 1, scope: !5) +!16 = !DILocation(line: 9, column: 1, scope: !5) +!17 = !DILocation(line: 10, column: 1, scope: !5) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll index 1cde8b9bad6fc2..2bcc93127da1e0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll @@ -1,10 +1,11 @@ ; REQUIRES: asserts -; RUN: opt < %s -passes=loop-vectorize -debug-only=loop-vectorize -S 2>&1 | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S 2>&1 | FileCheck %s target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "arm64-apple-ios5.0.0" define void @selects_1(ptr nocapture %dst, i32 %A, i32 %B, i32 %C, i32 %N) { +; CHECK: LV: Checking a loop in 'selects_1' ; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond = select i1 %cmp1, i32 10, i32 %and ; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond6 = select i1 %cmp2, i32 30, i32 %and ; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6 @@ -12,17 +13,14 @@ define void @selects_1(ptr nocapture %dst, i32 %A, i32 %B, i32 %C, i32 %N) { ; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %cond = select i1 %cmp1, i32 10, i32 %and ; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %cond6 = select i1 %cmp2, i32 30, i32 %and ; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6 - -; CHECK-LABEL: define void @selects_1( -; CHECK: vector.body: -; CHECK: select <4 x i1> +; CHECK: LV: Selecting VF: 4 entry: %cmp26 = icmp sgt i32 %N, 0 br i1 %cmp26, label %for.body.preheader, label %for.cond.cleanup for.body.preheader: ; preds = %entry - %wide.trip.count = zext i32 %N to i64 + %n = zext i32 %N to i64 br label %for.body for.body: ; preds = %for.body.preheader, %for.body @@ -38,7 +36,7 @@ for.body: ; preds = %for.body.preheader, %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6 store i32 %cond11, ptr %arrayidx, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + %exitcond.not = icmp eq i64 %indvars.iv.next, %n br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body for.cond.cleanup.loopexit: ; preds = %for.body @@ -47,3 +45,31 @@ for.cond.cleanup.loopexit: ; preds = %for.body for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry ret void } + +define i32 @multi_user_cmp(ptr readonly %a, i64 noundef %n) { +; CHECK: LV: Checking a loop in 'multi_user_cmp' +; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction: %cmp1 = fcmp olt float %load1, 0.000000e+00 +; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction: %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09 +; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction: %all.off = select i1 %cmp1, i1 %all.off.next, i1 false +; CHECK: LV: Selecting VF: 16. +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %all.off.next = phi i1 [ true, %entry ], [ %all.off, %for.body ] + %any.0.off09 = phi i1 [ false, %entry ], [ %.any.0.off0, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv + %load1 = load float, ptr %arrayidx, align 4 + %cmp1 = fcmp olt float %load1, 0.000000e+00 + %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09 + %all.off = select i1 %cmp1, i1 %all.off.next, i1 false + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + %0 = select i1 %.any.0.off0, i32 2, i32 3 + %1 = select i1 %all.off, i32 1, i32 %0 + ret i32 %1 +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll index 3217f508f0adce..812af1a102083f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll @@ -9,9 +9,14 @@ target triple = "aarch64-unknown-linux-gnu" ; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %indvars.iv.next1295 = add i7 %indvars.iv1294, 1 define void @induction_i7(ptr %dst) #0 { -; CHECK-LABEL: @induction_i7( +; CHECK-LABEL: define void @induction_i7( +; CHECK-SAME: ptr [[DST:%.*]]) ; CHECK: vector.ph: -; CHECK: %ind.end = trunc i64 %n.vec to i7 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i7 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv2i8() @@ -19,10 +24,16 @@ define void @induction_i7(ptr %dst) #0 { ; CHECK-NEXT: [[TMP8:%.*]] = add [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = mul [[TMP8]], shufflevector ( insertelement ( poison, i7 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[DOTSPLAT:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = call i7 @llvm.vscale.i7() +; CHECK-NEXT: [[TMP11:%.*]] = mul i7 [[TMP10]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = mul i7 1, [[TMP11]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i7 [[TMP12]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2 @@ -31,8 +42,8 @@ define void @induction_i7(ptr %dst) #0 { ; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = add [[VEC_IND]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = add [[STEP_ADD]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP18]] ; CHECK-NEXT: [[TMP23:%.*]] = zext [[TMP19]] to ; CHECK-NEXT: [[TMP24:%.*]] = zext [[TMP20]] to ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i32 0 @@ -43,6 +54,9 @@ define void @induction_i7(ptr %dst) #0 { ; CHECK-NEXT: store [[TMP24]], ptr [[TMP28]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[STEP_ADD]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP29]], label %middle.block, label %[[VECTOR_BODY]] +; entry: br label %for.body @@ -69,18 +83,30 @@ for.end: ; preds = %for.body ; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %indvars.iv.next1295 = add i3 %indvars.iv1294, 1 define void @induction_i3_zext(ptr %dst) #0 { -; CHECK-LABEL: @induction_i3_zext( +; CHECK-LABEL: define void @induction_i3_zext( +; CHECK-SAME: ptr [[DST:%.*]]) ; CHECK: vector.ph: -; CHECK: %ind.end = trunc i64 %n.vec to i3 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i3 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv2i8() ; CHECK-NEXT: [[TMP7:%.*]] = trunc [[TMP6]] to ; CHECK-NEXT: [[TMP8:%.*]] = add [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = mul [[TMP8]], shufflevector ( insertelement ( poison, i3 1, i64 0), poison, zeroinitializer) -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = call i3 @llvm.vscale.i3() +; CHECK-NEXT: [[TMP11:%.*]] = mul i3 [[TMP10]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = mul i3 1, [[TMP11]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i3 [[TMP12]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() @@ -100,6 +126,9 @@ define void @induction_i3_zext(ptr %dst) #0 { ; CHECK-NEXT: store [[TMP20]], ptr [[TMP26]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[STEP_ADD]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP27]], label %middle.block, label %[[VECTOR_BODY]] +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll index 8b64d7a083662e..071d518599caca 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll @@ -27,26 +27,7 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) { ; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START_1]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2 -; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2 -; CHECK-NEXT: [[TMP15:%.*]] = mul i64 8, [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP13]], 0 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP16]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = call @llvm.experimental.stepvector.nxv2i64() -; CHECK-NEXT: [[TMP18:%.*]] = add [[DOTSPLAT]], [[TMP17]] -; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul [[TMP18]], shufflevector ( insertelement ( poison, i64 8, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[VECTOR_GEP]] -; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP13]], 1 -; CHECK-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP20]], i64 0 -; CHECK-NEXT: [[DOTSPLAT6:%.*]] = shufflevector [[DOTSPLATINSERT5]], poison, zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = call @llvm.experimental.stepvector.nxv2i64() -; CHECK-NEXT: [[TMP22:%.*]] = add [[DOTSPLAT6]], [[TMP21]] -; CHECK-NEXT: [[VECTOR_GEP7:%.*]] = mul [[TMP22]], shufflevector ( insertelement ( poison, i64 8, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[VECTOR_GEP7]] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() @@ -63,7 +44,6 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) { ; CHECK-NEXT: store zeroinitializer, ptr [[TMP32]], align 8 ; CHECK-NEXT: store zeroinitializer, ptr [[TMP35]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] -; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll index bd52c2a8f06452..7f258d57e7018b 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll @@ -8,9 +8,9 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; CHECK-LABEL: VPlan 'Initial VPlan for VF={2,4},UF>=1' { -; CHECK-NEXT: Live-in vp<%0> = VF * UF -; CHECK-NEXT: Live-in vp<%1> = vector-trip-count -; CHECK-NEXT: Live-in vp<%2> = backedge-taken count +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-NEXT: Live-in ir<%N> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: @@ -18,37 +18,37 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_INC:%.*]]> ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next, ir<1> -; CHECK-NEXT: EMIT vp<%4> = icmp ule ir<%iv>, vp<%2> +; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { ; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<%4> +; CHECK-NEXT: BRANCH-ON-MASK vp<[[CMP]]> ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%3>, ir<1> -; CHECK-NEXT: REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%5> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<[[STEPS]]> ; CHECK-NEXT: REPLICATE ir<%0> = load ir<%arrayidx> -; CHECK-NEXT: REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<%5> +; CHECK-NEXT: REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<[[STEPS]]> ; CHECK-NEXT: REPLICATE ir<%1> = load ir<%arrayidx2> -; CHECK-NEXT: REPLICATE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<%5> +; CHECK-NEXT: REPLICATE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> ; CHECK-NEXT: REPLICATE ir<%add> = add nsw ir<%1>, ir<%0> ; CHECK-NEXT: REPLICATE store ir<%add>, ir<%arrayidx4> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%6> = ir<%0> -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%7> = ir<%1> +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[P1:%.+]]> = ir<%0> +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[P2:%.+]]> = ir<%1> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): for.body.2 ; CHECK-EMPTY: ; CHECK-NEXT: for.body.2: -; CHECK-NEXT: EMIT vp<%8> = add vp<%3>, vp<%0> -; CHECK-NEXT: EMIT branch-on-count vp<%8>, vp<%1> +; CHECK-NEXT: EMIT vp<[[CAN_INC:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_INC]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -74,8 +74,8 @@ for.cond.cleanup: define void @safe_dep(ptr %p) { ; CHECK-LABEL: VPlan 'Initial VPlan for VF={2},UF>=1' { -; CHECK-NEXT: Live-in vp<%0> = VF * UF -; CHECK-NEXT: Live-in vp<%1> = vector-trip-count +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in ir<512> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: @@ -83,17 +83,17 @@ define void @safe_dep(ptr %p) { ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%6> -; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1> -; CHECK-NEXT: CLONE ir<%a1> = getelementptr ir<%p>, vp<%3> -; CHECK-NEXT: vp<%4> = vector-pointer ir<%a1> -; CHECK-NEXT: WIDEN ir<%v> = load vp<%4> -; CHECK-NEXT: CLONE ir<%offset> = add vp<%3>, ir<100> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_INC:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<%a1> = getelementptr ir<%p>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[VPTR1:%.+]]> = vector-pointer ir<%a1> +; CHECK-NEXT: WIDEN ir<%v> = load vp<[[VPTR1]]> +; CHECK-NEXT: CLONE ir<%offset> = add vp<[[STEPS]]>, ir<100> ; CHECK-NEXT: CLONE ir<%a2> = getelementptr ir<%p>, ir<%offset> -; CHECK-NEXT: vp<%5> = vector-pointer ir<%a2> -; CHECK-NEXT: WIDEN store vp<%5>, ir<%v> -; CHECK-NEXT: EMIT vp<%6> = add nuw vp<%2>, vp<%0> -; CHECK-NEXT: EMIT branch-on-count vp<%6>, vp<%1> +; CHECK-NEXT: vp<[[VPTR2:%.+]]> = vector-pointer ir<%a2> +; CHECK-NEXT: WIDEN store vp<[[VPTR2]]>, ir<%v> +; CHECK-NEXT: EMIT vp<[[CAN_INC]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_INC]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll index 4e26eb38f21a9d..8824fa8a16b748 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize < %s -S -o - | FileCheck %s -check-prefix=OUTLOOP ; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize -prefer-inloop-reductions < %s -S -o - | FileCheck %s -check-prefix=INLOOP -; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s +; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL-OUTLOOP %s +; RUN: opt -passes=loop-vectorize -prefer-inloop-reductions -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL-INLOOP %s + target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128" target triple = "riscv64" -; FIXME: inloop reductions are not supported yet with predicated vectorization. - define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) { ; OUTLOOP-LABEL: @add_i16_i32( ; OUTLOOP-NEXT: entry: @@ -117,67 +117,84 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) { ; INLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] ; INLOOP-NEXT: ret i32 [[R_0_LCSSA]] ; -; IF-EVL-LABEL: @add_i16_i32( -; IF-EVL-NEXT: entry: -; IF-EVL-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; IF-EVL-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; IF-EVL: for.body.preheader: -; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; IF-EVL: vector.ph: -; IF-EVL-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() -; IF-EVL-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4 -; IF-EVL-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], 1 -; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP2]] -; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]] -; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1 -; IF-EVL-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() -; IF-EVL-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 4 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] -; IF-EVL: vector.body: -; IF-EVL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[INDEX]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv4i32() -; IF-EVL-NEXT: [[TMP7:%.*]] = add zeroinitializer, [[TMP6]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP7]] -; IF-EVL-NEXT: [[TMP8:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] -; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP5]] -; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0 -; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i16.p0(ptr [[TMP10]], i32 2, [[TMP8]], poison) -; IF-EVL-NEXT: [[TMP11:%.*]] = sext [[WIDE_MASKED_LOAD]] to -; IF-EVL-NEXT: [[TMP12]] = add [[VEC_PHI]], [[TMP11]] -; IF-EVL-NEXT: [[TMP13:%.*]] = select [[TMP8]], [[TMP12]], [[VEC_PHI]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]] -; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; IF-EVL: middle.block: -; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP13]]) -; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; IF-EVL-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]] -; IF-EVL-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 -; IF-EVL-NEXT: [[CONV:%.*]] = sext i16 [[TMP16]] to i32 -; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]] -; IF-EVL-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 -; IF-EVL-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; IF-EVL: for.cond.cleanup.loopexit: -; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP]] -; IF-EVL: for.cond.cleanup: -; IF-EVL-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] -; IF-EVL-NEXT: ret i32 [[R_0_LCSSA]] +; IF-EVL-OUTLOOP-LABEL: @add_i16_i32( +; IF-EVL-OUTLOOP-NEXT: entry: +; IF-EVL-OUTLOOP-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; IF-EVL-OUTLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; IF-EVL-OUTLOOP: for.body.preheader: +; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL-OUTLOOP: for.body: +; IF-EVL-OUTLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; IF-EVL-OUTLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[I_08]] +; IF-EVL-OUTLOOP-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; IF-EVL-OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 +; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]] +; IF-EVL-OUTLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 +; IF-EVL-OUTLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]] +; IF-EVL-OUTLOOP: for.cond.cleanup.loopexit: +; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP]] +; IF-EVL-OUTLOOP: for.cond.cleanup: +; IF-EVL-OUTLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; IF-EVL-OUTLOOP-NEXT: ret i32 [[R_0_LCSSA]] +; +; IF-EVL-INLOOP-LABEL: @add_i16_i32( +; IF-EVL-INLOOP-NEXT: entry: +; IF-EVL-INLOOP-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; IF-EVL-INLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; IF-EVL-INLOOP: for.body.preheader: +; IF-EVL-INLOOP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL-INLOOP: vector.ph: +; IF-EVL-INLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-INLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 8 +; IF-EVL-INLOOP-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], 1 +; IF-EVL-INLOOP-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP2]] +; IF-EVL-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]] +; IF-EVL-INLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-INLOOP-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-INLOOP-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 8 +; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL-INLOOP: vector.body: +; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]] +; IF-EVL-INLOOP-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP5]], i32 8, i1 true) +; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = add i32 [[EVL_BASED_IV]], 0 +; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]] +; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0 +; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv8i16.p0(ptr align 2 [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = sext [[VP_OP_LOAD]] to +; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.vp.reduce.add.nxv8i32(i32 0, [[TMP10]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP6]]) +; IF-EVL-INLOOP-NEXT: [[TMP12]] = add i32 [[TMP11]], [[VEC_PHI]] +; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP6]], [[EVL_BASED_IV]] +; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]] +; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-INLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-INLOOP: middle.block: +; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; IF-EVL-INLOOP: scalar.ph: +; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL-INLOOP: for.body: +; IF-EVL-INLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; IF-EVL-INLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]] +; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; IF-EVL-INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +; IF-EVL-INLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]] +; IF-EVL-INLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 +; IF-EVL-INLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] +; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; IF-EVL-INLOOP: for.cond.cleanup.loopexit: +; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; IF-EVL-INLOOP-NEXT: br label [[FOR_COND_CLEANUP]] +; IF-EVL-INLOOP: for.cond.cleanup: +; IF-EVL-INLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; IF-EVL-INLOOP-NEXT: ret i32 [[R_0_LCSSA]] ; entry: %cmp6 = icmp sgt i32 %n, 0 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll new file mode 100644 index 00000000000000..8bde5ba5f15193 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll @@ -0,0 +1,891 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=loop-vectorize \ +; RUN: -force-tail-folding-style=data-with-evl \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=IF-EVL-OUTLOOP + +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-inloop-reductions \ +; RUN: -force-tail-folding-style=data-with-evl \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=IF-EVL-INLOOP + +; RUN: opt -passes=loop-vectorize \ +; RUN: -force-tail-folding-style=none \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ +; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=NO-VP-OUTLOOP + +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-inloop-reductions \ +; RUN: -force-tail-folding-style=none \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ +; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=NO-VP-INLOOP + +define i32 @cond_add(ptr %a, i64 %n, i32 %start) { +; IF-EVL-OUTLOOP-LABEL: define i32 @cond_add( +; IF-EVL-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0:[0-9]+]] { +; IF-EVL-OUTLOOP-NEXT: entry: +; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL-OUTLOOP: for.body: +; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]] +; IF-EVL-OUTLOOP-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP27]], 3 +; IF-EVL-OUTLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP27]], i32 0 +; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]] +; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add nuw nsw i64 [[EVL_BASED_IV]], 1 +; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-OUTLOOP: for.end: +; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: ret i32 [[ADD_LCSSA]] +; +; IF-EVL-INLOOP-LABEL: define i32 @cond_add( +; IF-EVL-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0:[0-9]+]] { +; IF-EVL-INLOOP-NEXT: entry: +; IF-EVL-INLOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] +; IF-EVL-INLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-INLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-INLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-INLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL-INLOOP: vector.ph: +; IF-EVL-INLOOP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 +; IF-EVL-INLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] +; IF-EVL-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]] +; IF-EVL-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL-INLOOP: vector.body: +; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true) +; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] +; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 +; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = icmp sgt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = select [[TMP19]], [[VP_OP_LOAD]], zeroinitializer +; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-INLOOP-NEXT: [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]] +; IF-EVL-INLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP12]] to i64 +; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] +; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; IF-EVL-INLOOP-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-INLOOP-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-INLOOP: middle.block: +; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-INLOOP: scalar.ph: +; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL-INLOOP: for.body: +; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-INLOOP-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP25]], 3 +; IF-EVL-INLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP25]], i32 0 +; IF-EVL-INLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]] +; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; IF-EVL-INLOOP: for.end: +; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] +; IF-EVL-INLOOP-NEXT: ret i32 [[ADD_LCSSA]] +; +; NO-VP-OUTLOOP-LABEL: define i32 @cond_add( +; NO-VP-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0:[0-9]+]] { +; NO-VP-OUTLOOP-NEXT: entry: +; NO-VP-OUTLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-OUTLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; NO-VP-OUTLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[N]], [[TMP2]] +; NO-VP-OUTLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP-OUTLOOP: vector.ph: +; NO-VP-OUTLOOP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-OUTLOOP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; NO-VP-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP7]] +; NO-VP-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-OUTLOOP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-OUTLOOP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 +; NO-VP-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP-OUTLOOP: vector.body: +; NO-VP-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP11]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]] +; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load , ptr [[TMP14]], align 4 +; NO-VP-OUTLOOP-NEXT: [[TMP21:%.*]] = icmp sgt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = select [[TMP21]], [[WIDE_MASKED_LOAD]], zeroinitializer +; NO-VP-OUTLOOP-NEXT: [[TMP17]] = add [[TMP16]], [[VEC_PHI]] +; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; NO-VP-OUTLOOP-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-OUTLOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-VP-OUTLOOP: middle.block: +; NO-VP-OUTLOOP-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP17]]) +; NO-VP-OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP-OUTLOOP: scalar.ph: +; NO-VP-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP-OUTLOOP: for.body: +; NO-VP-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-OUTLOOP-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP15]], 3 +; NO-VP-OUTLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP15]], i32 0 +; NO-VP-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]] +; NO-VP-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; NO-VP-OUTLOOP: for.end: +; NO-VP-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; NO-VP-OUTLOOP-NEXT: ret i32 [[ADD_LCSSA]] +; +; NO-VP-INLOOP-LABEL: define i32 @cond_add( +; NO-VP-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0:[0-9]+]] { +; NO-VP-INLOOP-NEXT: entry: +; NO-VP-INLOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-INLOOP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; NO-VP-INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP-INLOOP: vector.ph: +; NO-VP-INLOOP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-INLOOP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-INLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-INLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP-INLOOP: vector.body: +; NO-VP-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-INLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]] +; NO-VP-INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-INLOOP-NEXT: [[TMP9:%.*]] = icmp sgt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; NO-VP-INLOOP-NEXT: [[TMP10:%.*]] = select [[TMP9]], [[WIDE_LOAD]], zeroinitializer +; NO-VP-INLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP10]]) +; NO-VP-INLOOP-NEXT: [[TMP12]] = add i32 [[TMP11]], [[VEC_PHI]] +; NO-VP-INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-INLOOP-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-INLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-VP-INLOOP: middle.block: +; NO-VP-INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP-INLOOP: scalar.ph: +; NO-VP-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-INLOOP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP-INLOOP: for.body: +; NO-VP-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-INLOOP-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP14]], 3 +; NO-VP-INLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP14]], i32 0 +; NO-VP-INLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]] +; NO-VP-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; NO-VP-INLOOP: for.end: +; NO-VP-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; NO-VP-INLOOP-NEXT: ret i32 [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp = icmp sgt i32 %0, 3 + %select = select i1 %cmp, i32 %0, i32 0 + %add = add nsw i32 %select, %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %add +} + +define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { +; IF-EVL-OUTLOOP-LABEL: define i32 @cond_add_pred( +; IF-EVL-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] { +; IF-EVL-OUTLOOP-NEXT: entry: +; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL-OUTLOOP: for.body: +; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[MIDDLE_BLOCK:%.*]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP27:%.*]], [[MIDDLE_BLOCK]] ] +; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-OUTLOOP-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP28]], 3 +; IF-EVL-OUTLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[MIDDLE_BLOCK]] +; IF-EVL-OUTLOOP: if.then: +; IF-EVL-OUTLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[BC_MERGE_RDX]], [[TMP28]] +; IF-EVL-OUTLOOP-NEXT: br label [[MIDDLE_BLOCK]] +; IF-EVL-OUTLOOP: for.inc: +; IF-EVL-OUTLOOP-NEXT: [[TMP27]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[BC_MERGE_RDX]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-OUTLOOP: for.end: +; IF-EVL-OUTLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[TMP27]], [[MIDDLE_BLOCK]] ] +; IF-EVL-OUTLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]] +; +; IF-EVL-INLOOP-LABEL: define i32 @cond_add_pred( +; IF-EVL-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] { +; IF-EVL-INLOOP-NEXT: entry: +; IF-EVL-INLOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] +; IF-EVL-INLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-INLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-INLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-INLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL-INLOOP: vector.ph: +; IF-EVL-INLOOP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 +; IF-EVL-INLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] +; IF-EVL-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]] +; IF-EVL-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-INLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL-INLOOP: vector.body: +; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true) +; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = add zeroinitializer, [[TMP14]] +; IF-EVL-INLOOP-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP15]] +; IF-EVL-INLOOP-NEXT: [[TMP16:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] +; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 +; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) +; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = icmp sgt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = select [[TMP16]], [[TMP19]], zeroinitializer +; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, [[VP_OP_LOAD]], [[TMP20]], i32 [[TMP12]]) +; IF-EVL-INLOOP-NEXT: [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]] +; IF-EVL-INLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP12]] to i64 +; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] +; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; IF-EVL-INLOOP-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-INLOOP-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-INLOOP: middle.block: +; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-INLOOP: scalar.ph: +; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL-INLOOP: for.body: +; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; IF-EVL-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RDX_ADD:%.*]], [[FOR_INC]] ] +; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-INLOOP-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP25]], 3 +; IF-EVL-INLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; IF-EVL-INLOOP: if.then: +; IF-EVL-INLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[RDX]], [[TMP25]] +; IF-EVL-INLOOP-NEXT: br label [[FOR_INC]] +; IF-EVL-INLOOP: for.inc: +; IF-EVL-INLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL-INLOOP: for.end: +; IF-EVL-INLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[FOR_INC]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] +; IF-EVL-INLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]] +; +; NO-VP-OUTLOOP-LABEL: define i32 @cond_add_pred( +; NO-VP-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] { +; NO-VP-OUTLOOP-NEXT: entry: +; NO-VP-OUTLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-OUTLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; NO-VP-OUTLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[N]], [[TMP2]] +; NO-VP-OUTLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP-OUTLOOP: vector.ph: +; NO-VP-OUTLOOP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-OUTLOOP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; NO-VP-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP7]] +; NO-VP-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-OUTLOOP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-OUTLOOP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 +; NO-VP-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP-OUTLOOP: vector.body: +; NO-VP-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP11]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]] +; NO-VP-OUTLOOP-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load , ptr [[TMP21]], align 4 +; NO-VP-OUTLOOP-NEXT: [[TMP22:%.*]] = icmp sgt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_LOAD]] +; NO-VP-OUTLOOP-NEXT: [[TMP17:%.*]] = xor [[TMP22]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select [[TMP17]], [[VEC_PHI]], [[TMP16]] +; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; NO-VP-OUTLOOP-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-OUTLOOP-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; NO-VP-OUTLOOP: middle.block: +; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PREDPHI]]) +; NO-VP-OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP-OUTLOOP: scalar.ph: +; NO-VP-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP-OUTLOOP: for.body: +; NO-VP-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; NO-VP-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RDX_ADD:%.*]], [[FOR_INC]] ] +; NO-VP-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-OUTLOOP-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP15]], 3 +; NO-VP-OUTLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; NO-VP-OUTLOOP: if.then: +; NO-VP-OUTLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[RDX]], [[TMP15]] +; NO-VP-OUTLOOP-NEXT: br label [[FOR_INC]] +; NO-VP-OUTLOOP: for.inc: +; NO-VP-OUTLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; NO-VP-OUTLOOP: for.end: +; NO-VP-OUTLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[FOR_INC]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; NO-VP-OUTLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]] +; +; NO-VP-INLOOP-LABEL: define i32 @cond_add_pred( +; NO-VP-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] { +; NO-VP-INLOOP-NEXT: entry: +; NO-VP-INLOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-INLOOP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; NO-VP-INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP-INLOOP: vector.ph: +; NO-VP-INLOOP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-INLOOP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-INLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-INLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP-INLOOP: vector.body: +; NO-VP-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-INLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]] +; NO-VP-INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-INLOOP-NEXT: [[TMP9:%.*]] = icmp sgt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; NO-VP-INLOOP-NEXT: [[TMP10:%.*]] = select [[TMP9]], [[WIDE_LOAD]], zeroinitializer +; NO-VP-INLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP10]]) +; NO-VP-INLOOP-NEXT: [[TMP12]] = add i32 [[TMP11]], [[VEC_PHI]] +; NO-VP-INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-INLOOP-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-INLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; NO-VP-INLOOP: middle.block: +; NO-VP-INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP-INLOOP: scalar.ph: +; NO-VP-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-INLOOP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP-INLOOP: for.body: +; NO-VP-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; NO-VP-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RDX_ADD:%.*]], [[FOR_INC]] ] +; NO-VP-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-INLOOP-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP14]], 3 +; NO-VP-INLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; NO-VP-INLOOP: if.then: +; NO-VP-INLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[RDX]], [[TMP14]] +; NO-VP-INLOOP-NEXT: br label [[FOR_INC]] +; NO-VP-INLOOP: for.inc: +; NO-VP-INLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; NO-VP-INLOOP: for.end: +; NO-VP-INLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[FOR_INC]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; NO-VP-INLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %rdx = phi i32 [ %start, %entry ], [ %rdx.add, %for.inc ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp = icmp sgt i32 %0, 3 + br i1 %cmp, label %if.then, label %for.inc + +if.then: + %add.pred = add nsw i32 %rdx, %0 + br label %for.inc + +for.inc: + %rdx.add = phi i32 [ %add.pred, %if.then ], [ %rdx, %for.body ] + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %rdx.add +} + +define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) { +; IF-EVL-OUTLOOP-LABEL: define i32 @step_cond_add( +; IF-EVL-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] { +; IF-EVL-OUTLOOP-NEXT: entry: +; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL-OUTLOOP: for.body: +; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]] +; IF-EVL-OUTLOOP-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-OUTLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32 +; IF-EVL-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP37]], [[IV_TRUNC]] +; IF-EVL-OUTLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP37]], i32 0 +; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]] +; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add nuw nsw i64 [[EVL_BASED_IV]], 1 +; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-OUTLOOP: for.end: +; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: ret i32 [[ADD_LCSSA]] +; +; IF-EVL-INLOOP-LABEL: define i32 @step_cond_add( +; IF-EVL-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] { +; IF-EVL-INLOOP-NEXT: entry: +; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL-INLOOP: for.body: +; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]] +; IF-EVL-INLOOP-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-INLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32 +; IF-EVL-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP28]], [[IV_TRUNC]] +; IF-EVL-INLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP28]], i32 0 +; IF-EVL-INLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]] +; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add nuw nsw i64 [[EVL_BASED_IV]], 1 +; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] +; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-INLOOP: for.end: +; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: ret i32 [[ADD_LCSSA]] +; +; NO-VP-OUTLOOP-LABEL: define i32 @step_cond_add( +; NO-VP-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] { +; NO-VP-OUTLOOP-NEXT: entry: +; NO-VP-OUTLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-OUTLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; NO-VP-OUTLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[N]], [[TMP2]] +; NO-VP-OUTLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP-OUTLOOP: vector.ph: +; NO-VP-OUTLOOP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-OUTLOOP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; NO-VP-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP7]] +; NO-VP-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-OUTLOOP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-OUTLOOP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 +; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i32() +; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = add [[TMP12]], zeroinitializer +; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = mul [[TMP13]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; NO-VP-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP14]] +; NO-VP-OUTLOOP-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 4 +; NO-VP-OUTLOOP-NEXT: [[TMP17:%.*]] = mul i32 1, [[TMP16]] +; NO-VP-OUTLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP17]], i64 0 +; NO-VP-OUTLOOP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; NO-VP-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP-OUTLOOP: vector.body: +; NO-VP-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP11]], [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-OUTLOOP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] +; NO-VP-OUTLOOP-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0 +; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load , ptr [[TMP20]], align 4 +; NO-VP-OUTLOOP-NEXT: [[TMP27:%.*]] = icmp sgt [[WIDE_MASKED_LOAD]], [[VEC_IND]] +; NO-VP-OUTLOOP-NEXT: [[TMP22:%.*]] = select [[TMP27]], [[WIDE_MASKED_LOAD]], zeroinitializer +; NO-VP-OUTLOOP-NEXT: [[TMP23]] = add [[TMP22]], [[VEC_PHI]] +; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; NO-VP-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; NO-VP-OUTLOOP-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-OUTLOOP-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; NO-VP-OUTLOOP: middle.block: +; NO-VP-OUTLOOP-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP23]]) +; NO-VP-OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP-OUTLOOP: scalar.ph: +; NO-VP-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP26]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP-OUTLOOP: for.body: +; NO-VP-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-OUTLOOP-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-OUTLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32 +; NO-VP-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP21]], [[IV_TRUNC]] +; NO-VP-OUTLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP21]], i32 0 +; NO-VP-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]] +; NO-VP-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; NO-VP-OUTLOOP: for.end: +; NO-VP-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ] +; NO-VP-OUTLOOP-NEXT: ret i32 [[ADD_LCSSA]] +; +; NO-VP-INLOOP-LABEL: define i32 @step_cond_add( +; NO-VP-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] { +; NO-VP-INLOOP-NEXT: entry: +; NO-VP-INLOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-INLOOP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; NO-VP-INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP-INLOOP: vector.ph: +; NO-VP-INLOOP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-INLOOP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-INLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-INLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-INLOOP-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv4i32() +; NO-VP-INLOOP-NEXT: [[TMP7:%.*]] = add [[TMP6]], zeroinitializer +; NO-VP-INLOOP-NEXT: [[TMP8:%.*]] = mul [[TMP7]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; NO-VP-INLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; NO-VP-INLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-INLOOP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 +; NO-VP-INLOOP-NEXT: [[TMP11:%.*]] = mul i32 1, [[TMP10]] +; NO-VP-INLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP11]], i64 0 +; NO-VP-INLOOP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; NO-VP-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP-INLOOP: vector.body: +; NO-VP-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-INLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]] +; NO-VP-INLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 +; NO-VP-INLOOP-NEXT: [[TMP15:%.*]] = icmp sgt [[WIDE_LOAD]], [[VEC_IND]] +; NO-VP-INLOOP-NEXT: [[TMP16:%.*]] = select [[TMP15]], [[WIDE_LOAD]], zeroinitializer +; NO-VP-INLOOP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP16]]) +; NO-VP-INLOOP-NEXT: [[TMP18]] = add i32 [[TMP17]], [[VEC_PHI]] +; NO-VP-INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-INLOOP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; NO-VP-INLOOP-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-INLOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; NO-VP-INLOOP: middle.block: +; NO-VP-INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP-INLOOP: scalar.ph: +; NO-VP-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-INLOOP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP-INLOOP: for.body: +; NO-VP-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-INLOOP-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-INLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32 +; NO-VP-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP20]], [[IV_TRUNC]] +; NO-VP-INLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP20]], i32 0 +; NO-VP-INLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]] +; NO-VP-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; NO-VP-INLOOP: for.end: +; NO-VP-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] +; NO-VP-INLOOP-NEXT: ret i32 [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %iv.trunc = trunc i64 %iv to i32 + %cmp = icmp sgt i32 %0, %iv.trunc + %select = select i1 %cmp, i32 %0, i32 0 + %add = add nsw i32 %select, %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %add +} + +define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { +; IF-EVL-OUTLOOP-LABEL: define i32 @step_cond_add_pred( +; IF-EVL-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] { +; IF-EVL-OUTLOOP-NEXT: entry: +; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL-OUTLOOP: for.body: +; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[MIDDLE_BLOCK:%.*]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP37:%.*]], [[MIDDLE_BLOCK]] ] +; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-OUTLOOP-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-OUTLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32 +; IF-EVL-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP38]], [[IV_TRUNC]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[MIDDLE_BLOCK]] +; IF-EVL-OUTLOOP: if.then: +; IF-EVL-OUTLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[BC_MERGE_RDX]], [[TMP38]] +; IF-EVL-OUTLOOP-NEXT: br label [[MIDDLE_BLOCK]] +; IF-EVL-OUTLOOP: for.inc: +; IF-EVL-OUTLOOP-NEXT: [[TMP37]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[BC_MERGE_RDX]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL-OUTLOOP: for.end: +; IF-EVL-OUTLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[TMP37]], [[MIDDLE_BLOCK]] ] +; IF-EVL-OUTLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]] +; +; IF-EVL-INLOOP-LABEL: define i32 @step_cond_add_pred( +; IF-EVL-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] { +; IF-EVL-INLOOP-NEXT: entry: +; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL-INLOOP: for.body: +; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[MIDDLE_BLOCK:%.*]] ] +; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP32:%.*]], [[MIDDLE_BLOCK]] ] +; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-INLOOP-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-INLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32 +; IF-EVL-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP35]], [[IV_TRUNC]] +; IF-EVL-INLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[MIDDLE_BLOCK]] +; IF-EVL-INLOOP: if.then: +; IF-EVL-INLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[BC_MERGE_RDX]], [[TMP35]] +; IF-EVL-INLOOP-NEXT: br label [[MIDDLE_BLOCK]] +; IF-EVL-INLOOP: for.inc: +; IF-EVL-INLOOP-NEXT: [[TMP32]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[BC_MERGE_RDX]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6]] +; IF-EVL-INLOOP: for.end: +; IF-EVL-INLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[TMP32]], [[MIDDLE_BLOCK]] ] +; IF-EVL-INLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]] +; +; NO-VP-OUTLOOP-LABEL: define i32 @step_cond_add_pred( +; NO-VP-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] { +; NO-VP-OUTLOOP-NEXT: entry: +; NO-VP-OUTLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-OUTLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; NO-VP-OUTLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[N]], [[TMP2]] +; NO-VP-OUTLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP-OUTLOOP: vector.ph: +; NO-VP-OUTLOOP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-OUTLOOP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; NO-VP-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP7]] +; NO-VP-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-OUTLOOP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-OUTLOOP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 +; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv4i32() +; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = add [[TMP12]], zeroinitializer +; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = mul [[TMP13]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; NO-VP-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP14]] +; NO-VP-OUTLOOP-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 4 +; NO-VP-OUTLOOP-NEXT: [[TMP17:%.*]] = mul i32 1, [[TMP16]] +; NO-VP-OUTLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP17]], i64 0 +; NO-VP-OUTLOOP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; NO-VP-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP-OUTLOOP: vector.body: +; NO-VP-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP11]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-OUTLOOP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] +; NO-VP-OUTLOOP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0 +; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load , ptr [[TMP27]], align 4 +; NO-VP-OUTLOOP-NEXT: [[TMP28:%.*]] = icmp sgt [[WIDE_MASKED_LOAD]], [[VEC_IND]] +; NO-VP-OUTLOOP-NEXT: [[TMP22:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_LOAD]] +; NO-VP-OUTLOOP-NEXT: [[TMP23:%.*]] = xor [[TMP28]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select [[TMP23]], [[VEC_PHI]], [[TMP22]] +; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; NO-VP-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; NO-VP-OUTLOOP-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-OUTLOOP-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; NO-VP-OUTLOOP: middle.block: +; NO-VP-OUTLOOP-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PREDPHI]]) +; NO-VP-OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP-OUTLOOP: scalar.ph: +; NO-VP-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP-OUTLOOP: for.body: +; NO-VP-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; NO-VP-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RDX_ADD:%.*]], [[FOR_INC]] ] +; NO-VP-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-OUTLOOP-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-OUTLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32 +; NO-VP-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP21]], [[IV_TRUNC]] +; NO-VP-OUTLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; NO-VP-OUTLOOP: if.then: +; NO-VP-OUTLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[RDX]], [[TMP21]] +; NO-VP-OUTLOOP-NEXT: br label [[FOR_INC]] +; NO-VP-OUTLOOP: for.inc: +; NO-VP-OUTLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; NO-VP-OUTLOOP: for.end: +; NO-VP-OUTLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[FOR_INC]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; NO-VP-OUTLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]] +; +; NO-VP-INLOOP-LABEL: define i32 @step_cond_add_pred( +; NO-VP-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] { +; NO-VP-INLOOP-NEXT: entry: +; NO-VP-INLOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-INLOOP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; NO-VP-INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP-INLOOP: vector.ph: +; NO-VP-INLOOP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-INLOOP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-INLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-INLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-INLOOP-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv4i32() +; NO-VP-INLOOP-NEXT: [[TMP7:%.*]] = add [[TMP6]], zeroinitializer +; NO-VP-INLOOP-NEXT: [[TMP8:%.*]] = mul [[TMP7]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; NO-VP-INLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; NO-VP-INLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-INLOOP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 +; NO-VP-INLOOP-NEXT: [[TMP11:%.*]] = mul i32 1, [[TMP10]] +; NO-VP-INLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP11]], i64 0 +; NO-VP-INLOOP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; NO-VP-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP-INLOOP: vector.body: +; NO-VP-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-INLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]] +; NO-VP-INLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 +; NO-VP-INLOOP-NEXT: [[TMP15:%.*]] = icmp sgt [[WIDE_LOAD]], [[VEC_IND]] +; NO-VP-INLOOP-NEXT: [[TMP16:%.*]] = select [[TMP15]], [[WIDE_LOAD]], zeroinitializer +; NO-VP-INLOOP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP16]]) +; NO-VP-INLOOP-NEXT: [[TMP18]] = add i32 [[TMP17]], [[VEC_PHI]] +; NO-VP-INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-INLOOP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; NO-VP-INLOOP-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-INLOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; NO-VP-INLOOP: middle.block: +; NO-VP-INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP-INLOOP: scalar.ph: +; NO-VP-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-INLOOP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP-INLOOP: for.body: +; NO-VP-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; NO-VP-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RDX_ADD:%.*]], [[FOR_INC]] ] +; NO-VP-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-INLOOP-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-INLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32 +; NO-VP-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP20]], [[IV_TRUNC]] +; NO-VP-INLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; NO-VP-INLOOP: if.then: +; NO-VP-INLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[RDX]], [[TMP20]] +; NO-VP-INLOOP-NEXT: br label [[FOR_INC]] +; NO-VP-INLOOP: for.inc: +; NO-VP-INLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; NO-VP-INLOOP: for.end: +; NO-VP-INLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[FOR_INC]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] +; NO-VP-INLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %rdx = phi i32 [ %start, %entry ], [ %rdx.add, %for.inc ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %iv.trunc = trunc i64 %iv to i32 + %cmp = icmp sgt i32 %0, %iv.trunc + br i1 %cmp, label %if.then, label %for.inc + +if.then: + %add.pred = add nsw i32 %rdx, %0 + br label %for.inc + +for.inc: + %rdx.add = phi i32 [ %add.pred, %if.then ], [ %rdx, %for.body ] + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %rdx.add +} + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} +;. +; IF-EVL-OUTLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} +; IF-EVL-OUTLOOP: [[META1]] = !{!"llvm.loop.vectorize.enable", i1 true} +;. +; IF-EVL-INLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; IF-EVL-INLOOP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; IF-EVL-INLOOP: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; IF-EVL-INLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; IF-EVL-INLOOP: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; IF-EVL-INLOOP: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; IF-EVL-INLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META7:![0-9]+]]} +; IF-EVL-INLOOP: [[META7]] = !{!"llvm.loop.vectorize.enable", i1 true} +;. +; NO-VP-OUTLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; NO-VP-OUTLOOP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; NO-VP-OUTLOOP: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; NO-VP-OUTLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; NO-VP-OUTLOOP: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; NO-VP-OUTLOOP: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; NO-VP-OUTLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; NO-VP-OUTLOOP: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; NO-VP-OUTLOOP: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; NO-VP-OUTLOOP: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +;. +; NO-VP-INLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; NO-VP-INLOOP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; NO-VP-INLOOP: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; NO-VP-INLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; NO-VP-INLOOP: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; NO-VP-INLOOP: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; NO-VP-INLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; NO-VP-INLOOP: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; NO-VP-INLOOP: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; NO-VP-INLOOP: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll new file mode 100644 index 00000000000000..73dc3e4313a651 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll @@ -0,0 +1,1965 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-inloop-reductions \ +; RUN: -force-tail-folding-style=data-with-evl \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=IF-EVL + +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-inloop-reductions \ +; RUN: -force-tail-folding-style=none \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ +; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP + +define i32 @add(ptr %a, i64 %n, i32 %start) { +; IF-EVL-LABEL: @add( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15]] = add i32 [[TMP14]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[TMP18]], [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: ret i32 [[ADD_LCSSA]] +; +; NO-VP-LABEL: @add( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[WIDE_LOAD]]) +; NO-VP-NEXT: [[TMP10]] = add i32 [[TMP9]], [[VEC_PHI]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ADD]] = add nsw i32 [[TMP12]], [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %0, %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %add +} + +; not support mul reduction for scalable vector +define i32 @mul(ptr %a, i64 %n, i32 %start) { +; IF-EVL-LABEL: @mul( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[MUL]] = mul nsw i32 [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret i32 [[MUL_LCSSA]] +; +; NO-VP-LABEL: @mul( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8 +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]] +; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 4 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4 +; NO-VP-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]]) +; NO-VP-NEXT: [[TMP7]] = mul i32 [[TMP6]], [[VEC_PHI]] +; NO-VP-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD2]]) +; NO-VP-NEXT: [[TMP9]] = mul i32 [[TMP8]], [[VEC_PHI1]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[BIN_RDX:%.*]] = mul i32 [[TMP9]], [[TMP7]] +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[MUL]] = mul nsw i32 [[TMP11]], [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[MUL_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %mul, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %mul = mul nsw i32 %0, %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %mul +} + +define i32 @or(ptr %a, i64 %n, i32 %start) { +; IF-EVL-LABEL: @or( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.or.nxv4i32(i32 0, [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15]] = or i32 [[TMP14]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[OR]] = or i32 [[TMP18]], [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: ret i32 [[OR_LCSSA]] +; +; NO-VP-LABEL: @or( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32( [[WIDE_LOAD]]) +; NO-VP-NEXT: [[TMP10]] = or i32 [[TMP9]], [[VEC_PHI]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[OR]] = or i32 [[TMP12]], [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[OR_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %or, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %or = or i32 %0, %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %or +} + +define i32 @and(ptr %a, i64 %n, i32 %start) { +; IF-EVL-LABEL: @and( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.and.nxv4i32(i32 -1, [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15]] = and i32 [[TMP14]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[AND:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[AND]] = and i32 [[TMP18]], [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: ret i32 [[AND_LCSSA]] +; +; NO-VP-LABEL: @and( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.and.nxv4i32( [[WIDE_LOAD]]) +; NO-VP-NEXT: [[TMP10]] = and i32 [[TMP9]], [[VEC_PHI]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[AND:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[AND]] = and i32 [[TMP12]], [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[AND_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %and, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %and = and i32 %0, %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %and +} + +define i32 @xor(ptr %a, i64 %n, i32 %start) { +; IF-EVL-LABEL: @xor( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.xor.nxv4i32(i32 0, [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15]] = xor i32 [[TMP14]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[XOR:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[XOR]] = xor i32 [[TMP18]], [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: ret i32 [[XOR_LCSSA]] +; +; NO-VP-LABEL: @xor( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[WIDE_LOAD]]) +; NO-VP-NEXT: [[TMP10]] = xor i32 [[TMP9]], [[VEC_PHI]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[XOR:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[XOR]] = xor i32 [[TMP12]], [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[XOR_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %xor, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %xor = xor i32 %0, %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %xor +} + +define i32 @smin(ptr %a, i64 %n, i32 %start) { +; IF-EVL-LABEL: @smin( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.smin.nxv4i32(i32 2147483647, [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP14]], i32 [[VEC_PHI]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP17]], [[RDX]] +; IF-EVL-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP17]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: ret i32 [[SMIN_LCSSA]] +; +; NO-VP-LABEL: @smin( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32( [[WIDE_LOAD]]) +; NO-VP-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP9]], i32 [[VEC_PHI]]) +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP11]], [[RDX]] +; NO-VP-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP11]], i32 [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[SMIN_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %smin, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp.i = icmp slt i32 %0, %rdx + %smin = select i1 %cmp.i, i32 %0, i32 %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %smin +} + +define i32 @smax(ptr %a, i64 %n, i32 %start) { +; IF-EVL-LABEL: @smax( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.smax.nxv4i32(i32 -2147483648, [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smax.i32(i32 [[TMP14]], i32 [[VEC_PHI]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMAX:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP17]], [[RDX]] +; IF-EVL-NEXT: [[SMAX]] = select i1 [[CMP_I]], i32 [[TMP17]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[SMAX_LCSSA:%.*]] = phi i32 [ [[SMAX]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: ret i32 [[SMAX_LCSSA]] +; +; NO-VP-LABEL: @smax( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.nxv4i32( [[WIDE_LOAD]]) +; NO-VP-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smax.i32(i32 [[TMP9]], i32 [[VEC_PHI]]) +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMAX:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP11]], [[RDX]] +; NO-VP-NEXT: [[SMAX]] = select i1 [[CMP_I]], i32 [[TMP11]], i32 [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[SMAX_LCSSA:%.*]] = phi i32 [ [[SMAX]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[SMAX_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %smax, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp.i = icmp sgt i32 %0, %rdx + %smax = select i1 %cmp.i, i32 %0, i32 %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %smax +} + +define i32 @umin(ptr %a, i64 %n, i32 %start) { +; IF-EVL-LABEL: @umin( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.umin.nxv4i32(i32 -1, [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[RDX_MINMAX]] = call i32 @llvm.umin.i32(i32 [[TMP14]], i32 [[VEC_PHI]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMIN:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[TMP17]], [[RDX]] +; IF-EVL-NEXT: [[UMIN]] = select i1 [[CMP_I]], i32 [[TMP17]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[UMIN_LCSSA:%.*]] = phi i32 [ [[UMIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: ret i32 [[UMIN_LCSSA]] +; +; NO-VP-LABEL: @umin( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.umin.nxv4i32( [[WIDE_LOAD]]) +; NO-VP-NEXT: [[RDX_MINMAX]] = call i32 @llvm.umin.i32(i32 [[TMP9]], i32 [[VEC_PHI]]) +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMIN:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[TMP11]], [[RDX]] +; NO-VP-NEXT: [[UMIN]] = select i1 [[CMP_I]], i32 [[TMP11]], i32 [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[UMIN_LCSSA:%.*]] = phi i32 [ [[UMIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[UMIN_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %umin, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp.i = icmp ult i32 %0, %rdx + %umin = select i1 %cmp.i, i32 %0, i32 %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %umin +} + +define i32 @umax(ptr %a, i64 %n, i32 %start) { +; IF-EVL-LABEL: @umax( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.umax.nxv4i32(i32 0, [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[RDX_MINMAX]] = call i32 @llvm.umax.i32(i32 [[TMP14]], i32 [[VEC_PHI]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMAX:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP17]], [[RDX]] +; IF-EVL-NEXT: [[UMAX]] = select i1 [[CMP_I]], i32 [[TMP17]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[UMAX_LCSSA:%.*]] = phi i32 [ [[UMAX]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: ret i32 [[UMAX_LCSSA]] +; +; NO-VP-LABEL: @umax( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32( [[WIDE_LOAD]]) +; NO-VP-NEXT: [[RDX_MINMAX]] = call i32 @llvm.umax.i32(i32 [[TMP9]], i32 [[VEC_PHI]]) +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMAX:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP11]], [[RDX]] +; NO-VP-NEXT: [[UMAX]] = select i1 [[CMP_I]], i32 [[TMP11]], i32 [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[UMAX_LCSSA:%.*]] = phi i32 [ [[UMAX]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[UMAX_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %umax, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp.i = icmp ugt i32 %0, %rdx + %umax = select i1 %cmp.i, i32 %0, i32 %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %umax +} + +define float @fadd(ptr %a, i64 %n, float %start) { +; IF-EVL-LABEL: @fadd( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = call reassoc float @llvm.vp.reduce.fadd.nxv4f32(float -0.000000e+00, [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15]] = fadd reassoc float [[TMP14]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD]] = fadd reassoc float [[TMP18]], [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: ret float [[ADD_LCSSA]] +; +; NO-VP-LABEL: @fadd( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[WIDE_LOAD]]) +; NO-VP-NEXT: [[TMP10]] = fadd reassoc float [[TMP9]], [[VEC_PHI]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ADD]] = fadd reassoc float [[TMP12]], [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret float [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi float [ %start, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %add = fadd reassoc float %0, %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret float %add +} + +; not support fmul reduction for scalable vector +define float @fmul(ptr %a, i64 %n, float %start) { +; IF-EVL-LABEL: @fmul( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[MUL]] = fmul reassoc float [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret float [[MUL_LCSSA]] +; +; NO-VP-LABEL: @fmul( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8 +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 +; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 +; NO-VP-NEXT: [[TMP6:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD]]) +; NO-VP-NEXT: [[TMP7]] = fmul reassoc float [[TMP6]], [[VEC_PHI]] +; NO-VP-NEXT: [[TMP8:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD2]]) +; NO-VP-NEXT: [[TMP9]] = fmul reassoc float [[TMP8]], [[VEC_PHI1]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[BIN_RDX:%.*]] = fmul reassoc float [[TMP9]], [[TMP7]] +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[MUL]] = fmul reassoc float [[TMP11]], [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret float [[MUL_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi float [ %start, %entry ], [ %mul, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %mul = fmul reassoc float %0, %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret float %mul +} + +define float @fmin(ptr %a, i64 %n, float %start) #0 { +; IF-EVL-LABEL: @fmin( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = call fast float @llvm.vp.reduce.fmin.nxv4f32(float 0x7FF0000000000000, [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt float [[TMP14]], [[VEC_PHI]] +; IF-EVL-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP14]], float [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP17]], [[RDX]] +; IF-EVL-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP17]], float [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: ret float [[MIN_LCSSA]] +; +; NO-VP-LABEL: @fmin( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32( [[WIDE_LOAD]]) +; NO-VP-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt float [[TMP9]], [[VEC_PHI]] +; NO-VP-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP9]], float [[VEC_PHI]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP11]], [[RDX]] +; NO-VP-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP11]], float [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret float [[MIN_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi float [ %start, %entry ], [ %min, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %cmp = fcmp fast olt float %0, %rdx + %min = select i1 %cmp, float %0, float %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret float %min +} + +define float @fmax(ptr %a, i64 %n, float %start) #0 { +; IF-EVL-LABEL: @fmax( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = call fast float @llvm.vp.reduce.fmax.nxv4f32(float 0xFFF0000000000000, [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt float [[TMP14]], [[VEC_PHI]] +; IF-EVL-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP14]], float [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[TMP17]], [[RDX]] +; IF-EVL-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP17]], float [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: ret float [[MAX_LCSSA]] +; +; NO-VP-LABEL: @fmax( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv4f32( [[WIDE_LOAD]]) +; NO-VP-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt float [[TMP9]], [[VEC_PHI]] +; NO-VP-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP9]], float [[VEC_PHI]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[TMP11]], [[RDX]] +; NO-VP-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP11]], float [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret float [[MAX_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi float [ %start, %entry ], [ %max, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %cmp = fcmp fast ogt float %0, %rdx + %max = select i1 %cmp, float %0, float %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret float %max +} + +define float @fminimum(ptr %a, i64 %n, float %start) { +; IF-EVL-LABEL: @fminimum( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MIN:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP0]]) +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret float [[MIN_LCSSA]] +; +; NO-VP-LABEL: @fminimum( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16 +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0 +; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <8 x float> [[MINMAX_IDENT_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP4]], align 4 +; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP5]], align 4 +; NO-VP-NEXT: [[TMP6]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_LOAD]]) +; NO-VP-NEXT: [[TMP7]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]]) +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; NO-VP-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[RDX_MINMAX:%.*]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[TMP6]], <8 x float> [[TMP7]]) +; NO-VP-NEXT: [[TMP9:%.*]] = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> [[RDX_MINMAX]]) +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP10]]) +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret float [[MIN_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi float [ %start, %entry ], [ %min, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %min = tail call float @llvm.minimum.f32(float %rdx, float %0) + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret float %min +} + +define float @fmaximum(ptr %a, i64 %n, float %start) { +; IF-EVL-LABEL: @fmaximum( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MAX:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP0]]) +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret float [[MAX_LCSSA]] +; +; NO-VP-LABEL: @fmaximum( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16 +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0 +; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <8 x float> [[MINMAX_IDENT_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP4]], align 4 +; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP5]], align 4 +; NO-VP-NEXT: [[TMP6]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_LOAD]]) +; NO-VP-NEXT: [[TMP7]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]]) +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; NO-VP-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[RDX_MINMAX:%.*]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[TMP6]], <8 x float> [[TMP7]]) +; NO-VP-NEXT: [[TMP9:%.*]] = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> [[RDX_MINMAX]]) +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP10]]) +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret float [[MAX_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi float [ %start, %entry ], [ %max, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %max = tail call float @llvm.maximum.f32(float %rdx, float %0) + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret float %max +} + +define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) { +; IF-EVL-LABEL: @fmuladd( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = fmul reassoc [[VP_OP_LOAD]], [[VP_OP_LOAD1]] +; IF-EVL-NEXT: [[TMP17:%.*]] = call reassoc float @llvm.vp.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP18]] = fadd reassoc float [[TMP17]], [[VEC_PHI]] +; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP21]], float [[TMP22]], float [[RDX]]) +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: ret float [[MULADD_LCSSA]] +; +; NO-VP-LABEL: @fmuladd( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP10]], align 4 +; NO-VP-NEXT: [[TMP11:%.*]] = fmul reassoc [[WIDE_LOAD]], [[WIDE_LOAD1]] +; NO-VP-NEXT: [[TMP12:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP11]]) +; NO-VP-NEXT: [[TMP13]] = fadd reassoc float [[TMP12]], [[VEC_PHI]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; NO-VP-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP15]], float [[TMP16]], float [[RDX]]) +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret float [[MULADD_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi float [ %start, %entry ], [ %muladd, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds float, ptr %b, i64 %iv + %1 = load float, ptr %arrayidx2, align 4 + %muladd = tail call reassoc float @llvm.fmuladd.f32(float %0, float %1, float %rdx) + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret float %muladd +} + +define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) { +; IF-EVL-LABEL: @anyof_icmp( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP0]], 3 +; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV:%.*]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]] +; +; NO-VP-LABEL: @anyof_icmp( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = icmp slt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; NO-VP-NEXT: [[TMP10]] = or [[VEC_PHI]], [[TMP9]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP10]]) +; NO-VP-NEXT: [[TMP13:%.*]] = freeze i1 [[TMP12]] +; NO-VP-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 [[INV:%.*]], i32 [[START:%.*]] +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP14]], 3 +; NO-VP-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[ANYOF_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %anyof, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp.i = icmp slt i32 %0, 3 + %anyof = select i1 %cmp.i, i32 %inv, i32 %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %anyof +} + +define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) { +; IF-EVL-LABEL: @anyof_fcmp( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP0]], 3.000000e+00 +; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV:%.*]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]] +; +; NO-VP-LABEL: @anyof_fcmp( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = fcmp fast olt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer) +; NO-VP-NEXT: [[TMP10]] = or [[VEC_PHI]], [[TMP9]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP10]]) +; NO-VP-NEXT: [[TMP13:%.*]] = freeze i1 [[TMP12]] +; NO-VP-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 [[INV:%.*]], i32 [[START:%.*]] +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP14]], 3.000000e+00 +; NO-VP-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[ANYOF_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %anyof, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %cmp.i = fcmp fast olt float %0, 3.0 + %anyof = select i1 %cmp.i, i32 %inv, i32 %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %anyof +} + +declare float @llvm.minimum.f32(float, float) +declare float @llvm.maximum.f32(float, float) +declare float @llvm.fmuladd.f32(float, float, float) + +attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" } + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll new file mode 100644 index 00000000000000..fcea9e8d81ff65 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll @@ -0,0 +1,265 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=loop-vectorize \ +; RUN: -force-tail-folding-style=data-with-evl \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=IF-EVL-OUTLOOP + +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-inloop-reductions \ +; RUN: -force-tail-folding-style=data-with-evl \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=IF-EVL-INLOOP + +; RUN: opt -passes=loop-vectorize \ +; RUN: -force-tail-folding-style=none \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ +; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=NO-VP-OUTLOOP + +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-inloop-reductions \ +; RUN: -force-tail-folding-style=none \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ +; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=NO-VP-INLOOP + +define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) { +; IF-EVL-OUTLOOP-LABEL: define void @reduction_intermediate_store( +; IF-EVL-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] { +; IF-EVL-OUTLOOP-NEXT: entry: +; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL-OUTLOOP: for.body: +; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-OUTLOOP-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[TMP27]], [[RDX]] +; IF-EVL-OUTLOOP-NEXT: store i32 [[ADD]], ptr [[ADDR]], align 4 +; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-OUTLOOP: for.end: +; IF-EVL-OUTLOOP-NEXT: ret void +; +; IF-EVL-INLOOP-LABEL: define void @reduction_intermediate_store( +; IF-EVL-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] { +; IF-EVL-INLOOP-NEXT: entry: +; IF-EVL-INLOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] +; IF-EVL-INLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-INLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-INLOOP-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-INLOOP-NEXT: br i1 [[TMP4]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; IF-EVL-INLOOP: vector.memcheck: +; IF-EVL-INLOOP-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ADDR]], i64 4 +; IF-EVL-INLOOP-NEXT: [[TMP5:%.*]] = shl i64 [[N]], 2 +; IF-EVL-INLOOP-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]] +; IF-EVL-INLOOP-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ADDR]], [[SCEVGEP1]] +; IF-EVL-INLOOP-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] +; IF-EVL-INLOOP-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; IF-EVL-INLOOP-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; IF-EVL-INLOOP: vector.ph: +; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 +; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = sub i64 [[TMP9]], 1 +; IF-EVL-INLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP10]] +; IF-EVL-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP9]] +; IF-EVL-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4 +; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL-INLOOP: vector.body: +; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP13]], i32 4, i1 true) +; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]] +; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0 +; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP14]]), !alias.scope [[META0:![0-9]+]] +; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP14]]) +; IF-EVL-INLOOP-NEXT: [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]] +; IF-EVL-INLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP14]] to i64 +; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] +; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP12]] +; IF-EVL-INLOOP-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-INLOOP-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; IF-EVL-INLOOP: middle.block: +; IF-EVL-INLOOP-NEXT: store i32 [[TMP22]], ptr [[ADDR]], align 4 +; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-INLOOP: scalar.ph: +; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[VECTOR_MEMCHECK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL-INLOOP: for.body: +; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; IF-EVL-INLOOP-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-INLOOP-NEXT: [[ADD]] = add nsw i32 [[TMP25]], [[RDX]] +; IF-EVL-INLOOP-NEXT: store i32 [[ADD]], ptr [[ADDR]], align 4 +; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-INLOOP: for.end: +; IF-EVL-INLOOP-NEXT: ret void +; +; NO-VP-OUTLOOP-LABEL: define void @reduction_intermediate_store( +; NO-VP-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] { +; NO-VP-OUTLOOP-NEXT: entry: +; NO-VP-OUTLOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-OUTLOOP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-OUTLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; NO-VP-OUTLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; NO-VP-OUTLOOP: vector.memcheck: +; NO-VP-OUTLOOP-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ADDR]], i64 4 +; NO-VP-OUTLOOP-NEXT: [[TMP3:%.*]] = shl i64 [[N]], 2 +; NO-VP-OUTLOOP-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]] +; NO-VP-OUTLOOP-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ADDR]], [[SCEVGEP1]] +; NO-VP-OUTLOOP-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] +; NO-VP-OUTLOOP-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; NO-VP-OUTLOOP-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; NO-VP-OUTLOOP: vector.ph: +; NO-VP-OUTLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-OUTLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP5]] +; NO-VP-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-OUTLOOP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-OUTLOOP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; NO-VP-OUTLOOP-NEXT: [[TMP8:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 +; NO-VP-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP-OUTLOOP: vector.body: +; NO-VP-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-OUTLOOP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] +; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0 +; NO-VP-OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP11]], align 4, !alias.scope [[META0:![0-9]+]] +; NO-VP-OUTLOOP-NEXT: [[TMP12]] = add [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] +; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-OUTLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; NO-VP-OUTLOOP: middle.block: +; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP12]]) +; NO-VP-OUTLOOP-NEXT: store i32 [[TMP14]], ptr [[ADDR]], align 4 +; NO-VP-OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP-OUTLOOP: scalar.ph: +; NO-VP-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; NO-VP-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ [[START]], [[VECTOR_MEMCHECK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP-OUTLOOP: for.body: +; NO-VP-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; NO-VP-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-OUTLOOP-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[TMP15]], [[RDX]] +; NO-VP-OUTLOOP-NEXT: store i32 [[ADD]], ptr [[ADDR]], align 4 +; NO-VP-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; NO-VP-OUTLOOP: for.end: +; NO-VP-OUTLOOP-NEXT: ret void +; +; NO-VP-INLOOP-LABEL: define void @reduction_intermediate_store( +; NO-VP-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] { +; NO-VP-INLOOP-NEXT: entry: +; NO-VP-INLOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-INLOOP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; NO-VP-INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; NO-VP-INLOOP: vector.memcheck: +; NO-VP-INLOOP-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ADDR]], i64 4 +; NO-VP-INLOOP-NEXT: [[TMP3:%.*]] = shl i64 [[N]], 2 +; NO-VP-INLOOP-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]] +; NO-VP-INLOOP-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ADDR]], [[SCEVGEP1]] +; NO-VP-INLOOP-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] +; NO-VP-INLOOP-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; NO-VP-INLOOP-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; NO-VP-INLOOP: vector.ph: +; NO-VP-INLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-INLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP5]] +; NO-VP-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-INLOOP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-INLOOP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; NO-VP-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP-INLOOP: vector.body: +; NO-VP-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-INLOOP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] +; NO-VP-INLOOP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 +; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 4, !alias.scope [[META0:![0-9]+]] +; NO-VP-INLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[WIDE_LOAD]]) +; NO-VP-INLOOP-NEXT: [[TMP12]] = add i32 [[TMP11]], [[VEC_PHI]] +; NO-VP-INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] +; NO-VP-INLOOP-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-INLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; NO-VP-INLOOP: middle.block: +; NO-VP-INLOOP-NEXT: store i32 [[TMP12]], ptr [[ADDR]], align 4 +; NO-VP-INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP-INLOOP: scalar.ph: +; NO-VP-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; NO-VP-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[VECTOR_MEMCHECK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-INLOOP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP-INLOOP: for.body: +; NO-VP-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; NO-VP-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-INLOOP-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-INLOOP-NEXT: [[ADD]] = add nsw i32 [[TMP14]], [[RDX]] +; NO-VP-INLOOP-NEXT: store i32 [[ADD]], ptr [[ADDR]], align 4 +; NO-VP-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; NO-VP-INLOOP: for.end: +; NO-VP-INLOOP-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %0, %rdx + store i32 %add, ptr %addr, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret void +} + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} +;. +; IF-EVL-OUTLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} +; IF-EVL-OUTLOOP: [[META1]] = !{!"llvm.loop.vectorize.enable", i1 true} +;. +; IF-EVL-INLOOP: [[META0]] = !{[[META1:![0-9]+]]} +; IF-EVL-INLOOP: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} +; IF-EVL-INLOOP: [[META2]] = distinct !{[[META2]], !"LVerDomain"} +; IF-EVL-INLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]], [[META5:![0-9]+]]} +; IF-EVL-INLOOP: [[META4]] = !{!"llvm.loop.isvectorized", i32 1} +; IF-EVL-INLOOP: [[META5]] = !{!"llvm.loop.unroll.runtime.disable"} +; IF-EVL-INLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]} +;. +; NO-VP-OUTLOOP: [[META0]] = !{[[META1:![0-9]+]]} +; NO-VP-OUTLOOP: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} +; NO-VP-OUTLOOP: [[META2]] = distinct !{[[META2]], !"LVerDomain"} +; NO-VP-OUTLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]], [[META5:![0-9]+]]} +; NO-VP-OUTLOOP: [[META4]] = !{!"llvm.loop.isvectorized", i32 1} +; NO-VP-OUTLOOP: [[META5]] = !{!"llvm.loop.unroll.runtime.disable"} +; NO-VP-OUTLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]} +;. +; NO-VP-INLOOP: [[META0]] = !{[[META1:![0-9]+]]} +; NO-VP-INLOOP: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} +; NO-VP-INLOOP: [[META2]] = distinct !{[[META2]], !"LVerDomain"} +; NO-VP-INLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]], [[META5:![0-9]+]]} +; NO-VP-INLOOP: [[META4]] = !{!"llvm.loop.isvectorized", i32 1} +; NO-VP-INLOOP: [[META5]] = !{!"llvm.loop.unroll.runtime.disable"} +; NO-VP-INLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll new file mode 100644 index 00000000000000..314d30f86ee57d --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll @@ -0,0 +1,101 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize \ +; RUN: -force-ordered-reductions=true -hints-allow-reordering=false \ +; RUN: -force-tail-folding-style=data-with-evl \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=IF-EVL + +; RUN: opt -passes=loop-vectorize \ +; RUN: -force-ordered-reductions=true -hints-allow-reordering=false \ +; RUN: -force-tail-folding-style=none \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP + +define float @fadd(ptr noalias nocapture readonly %a, i64 %n) { +; IF-EVL-LABEL: @fadd( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP14]] = call float @llvm.vp.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD]] = fadd float [[TMP17]], [[SUM_07]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: ret float [[ADD_LCSSA]] +; +; NO-VP-LABEL: @fadd( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] +; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ADD]] = fadd float [[TMP0]], [[SUM_07]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ] +; NO-VP-NEXT: ret float [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %add = fadd float %0, %sum.07 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret float %add +} + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll new file mode 100644 index 00000000000000..2bbcd362ce16c8 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll @@ -0,0 +1,1534 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize \ +; RUN: -force-tail-folding-style=data-with-evl \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=IF-EVL + +; RUN: opt -passes=loop-vectorize \ +; RUN: -force-tail-folding-style=none \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ +; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP + +define i32 @add(ptr %a, i64 %n, i32 %start) { +; IF-EVL-LABEL: @add( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret i32 [[ADD_LCSSA]] +; +; NO-VP-LABEL: @add( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[TMP6:%.*]] = insertelement zeroinitializer, i32 [[START:%.*]], i32 0 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]] +; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 4 +; NO-VP-NEXT: [[TMP10]] = add [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP10]]) +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ADD]] = add nsw i32 [[TMP13]], [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %0, %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %add +} + +; not support mul reduction for scalable vector +define i32 @mul(ptr %a, i64 %n, i32 %start) { +; IF-EVL-LABEL: @mul( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[MUL]] = mul nsw i32 [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret i32 [[MUL_LCSSA]] +; +; NO-VP-LABEL: @mul( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16 +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 [[START:%.*]], i32 0 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 +; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]] +; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]] +; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 8 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4 +; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4 +; NO-VP-NEXT: [[TMP7]] = mul <8 x i32> [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-NEXT: [[TMP8]] = mul <8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; NO-VP-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[BIN_RDX:%.*]] = mul <8 x i32> [[TMP8]], [[TMP7]] +; NO-VP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[BIN_RDX]]) +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[MUL]] = mul nsw i32 [[TMP11]], [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[MUL_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %mul, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %mul = mul nsw i32 %0, %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %mul +} + +define i32 @or(ptr %a, i64 %n, i32 %start) { +; IF-EVL-LABEL: @or( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[OR:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[OR]] = or i32 [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret i32 [[OR_LCSSA]] +; +; NO-VP-LABEL: @or( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[TMP6:%.*]] = insertelement zeroinitializer, i32 [[START:%.*]], i32 0 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]] +; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 4 +; NO-VP-NEXT: [[TMP10]] = or [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32( [[TMP10]]) +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[OR]] = or i32 [[TMP13]], [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[OR_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %or, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %or = or i32 %0, %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %or +} + +define i32 @and(ptr %a, i64 %n, i32 %start) { +; IF-EVL-LABEL: @and( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[AND:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[AND]] = and i32 [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret i32 [[AND_LCSSA]] +; +; NO-VP-LABEL: @and( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[TMP6:%.*]] = insertelement shufflevector ( insertelement ( poison, i32 -1, i64 0), poison, zeroinitializer), i32 [[START:%.*]], i32 0 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]] +; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 4 +; NO-VP-NEXT: [[TMP10]] = and [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.and.nxv4i32( [[TMP10]]) +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[AND:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[AND]] = and i32 [[TMP13]], [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[AND_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %and, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %and = and i32 %0, %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %and +} + +define i32 @xor(ptr %a, i64 %n, i32 %start) { +; IF-EVL-LABEL: @xor( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[XOR:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[XOR]] = xor i32 [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret i32 [[XOR_LCSSA]] +; +; NO-VP-LABEL: @xor( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[TMP6:%.*]] = insertelement zeroinitializer, i32 [[START:%.*]], i32 0 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]] +; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 4 +; NO-VP-NEXT: [[TMP10]] = xor [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP10]]) +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[XOR:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[XOR]] = xor i32 [[TMP13]], [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[XOR_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %xor, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %xor = xor i32 %0, %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %xor +} + +define i32 @smin(ptr %a, i64 %n, i32 %start) { +; IF-EVL-LABEL: @smin( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP0]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret i32 [[SMIN_LCSSA]] +; +; NO-VP-LABEL: @smin( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 +; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = icmp slt [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-NEXT: [[TMP10]] = select [[TMP9]], [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32( [[TMP10]]) +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP13]], [[RDX]] +; NO-VP-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP13]], i32 [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[SMIN_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %smin, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp.i = icmp slt i32 %0, %rdx + %smin = select i1 %cmp.i, i32 %0, i32 %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %smin +} + +define i32 @smax(ptr %a, i64 %n, i32 %start) { +; IF-EVL-LABEL: @smax( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[SMAX:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[SMAX]] = select i1 [[CMP_I]], i32 [[TMP0]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[SMAX_LCSSA:%.*]] = phi i32 [ [[SMAX]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret i32 [[SMAX_LCSSA]] +; +; NO-VP-LABEL: @smax( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 +; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = icmp sgt [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-NEXT: [[TMP10]] = select [[TMP9]], [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.smax.nxv4i32( [[TMP10]]) +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMAX:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP13]], [[RDX]] +; NO-VP-NEXT: [[SMAX]] = select i1 [[CMP_I]], i32 [[TMP13]], i32 [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[SMAX_LCSSA:%.*]] = phi i32 [ [[SMAX]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[SMAX_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %smax, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp.i = icmp sgt i32 %0, %rdx + %smax = select i1 %cmp.i, i32 %0, i32 %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %smax +} + +define i32 @umin(ptr %a, i64 %n, i32 %start) { +; IF-EVL-LABEL: @umin( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[UMIN:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[UMIN]] = select i1 [[CMP_I]], i32 [[TMP0]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[UMIN_LCSSA:%.*]] = phi i32 [ [[UMIN]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret i32 [[UMIN_LCSSA]] +; +; NO-VP-LABEL: @umin( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 +; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = icmp ult [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-NEXT: [[TMP10]] = select [[TMP9]], [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umin.nxv4i32( [[TMP10]]) +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMIN:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[TMP13]], [[RDX]] +; NO-VP-NEXT: [[UMIN]] = select i1 [[CMP_I]], i32 [[TMP13]], i32 [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[UMIN_LCSSA:%.*]] = phi i32 [ [[UMIN]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[UMIN_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %umin, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp.i = icmp ult i32 %0, %rdx + %umin = select i1 %cmp.i, i32 %0, i32 %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %umin +} + +define i32 @umax(ptr %a, i64 %n, i32 %start) { +; IF-EVL-LABEL: @umax( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[UMAX:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[UMAX]] = select i1 [[CMP_I]], i32 [[TMP0]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[UMAX_LCSSA:%.*]] = phi i32 [ [[UMAX]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret i32 [[UMAX_LCSSA]] +; +; NO-VP-LABEL: @umax( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[START:%.*]], i64 0 +; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = icmp ugt [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-NEXT: [[TMP10]] = select [[TMP9]], [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32( [[TMP10]]) +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[UMAX:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP13]], [[RDX]] +; NO-VP-NEXT: [[UMAX]] = select i1 [[CMP_I]], i32 [[TMP13]], i32 [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[UMAX_LCSSA:%.*]] = phi i32 [ [[UMAX]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[UMAX_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %umax, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp.i = icmp ugt i32 %0, %rdx + %umax = select i1 %cmp.i, i32 %0, i32 %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %umax +} + +define float @fadd(ptr %a, i64 %n, float %start) { +; IF-EVL-LABEL: @fadd( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD]] = fadd reassoc float [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret float [[ADD_LCSSA]] +; +; NO-VP-LABEL: @fadd( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[TMP6:%.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float [[START:%.*]], i32 0 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP7]] +; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 4 +; NO-VP-NEXT: [[TMP10]] = fadd reassoc [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[TMP12:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP10]]) +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ADD]] = fadd reassoc float [[TMP13]], [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret float [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi float [ %start, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %add = fadd reassoc float %0, %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret float %add +} + +; not support fmul reduction for scalable vector +define float @fmul(ptr %a, i64 %n, float %start) { +; IF-EVL-LABEL: @fmul( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[MUL]] = fmul reassoc float [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret float [[MUL_LCSSA]] +; +; NO-VP-LABEL: @fmul( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16 +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP0:%.*]] = insertelement <8 x float> , float [[START:%.*]], i32 0 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 +; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP1]] +; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 +; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 8 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP5]], align 4 +; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP6]], align 4 +; NO-VP-NEXT: [[TMP7]] = fmul reassoc <8 x float> [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-NEXT: [[TMP8]] = fmul reassoc <8 x float> [[WIDE_LOAD2]], [[VEC_PHI1]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; NO-VP-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[BIN_RDX:%.*]] = fmul reassoc <8 x float> [[TMP8]], [[TMP7]] +; NO-VP-NEXT: [[TMP10:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v8f32(float 1.000000e+00, <8 x float> [[BIN_RDX]]) +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[MUL]] = fmul reassoc float [[TMP11]], [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret float [[MUL_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi float [ %start, %entry ], [ %mul, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %mul = fmul reassoc float %0, %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret float %mul +} + +define float @fmin(ptr %a, i64 %n, float %start) #0 { +; IF-EVL-LABEL: @fmin( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MIN:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP0]], float [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret float [[MIN_LCSSA]] +; +; NO-VP-LABEL: @fmin( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, float [[START:%.*]], i64 0 +; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = fcmp fast olt [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-NEXT: [[TMP10]] = select [[TMP9]], [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[TMP12:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32( [[TMP10]]) +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP13]], [[RDX]] +; NO-VP-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP13]], float [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret float [[MIN_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi float [ %start, %entry ], [ %min, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %cmp = fcmp fast olt float %0, %rdx + %min = select i1 %cmp, float %0, float %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret float %min +} + +define float @fmax(ptr %a, i64 %n, float %start) #0 { +; IF-EVL-LABEL: @fmax( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MAX:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[TMP0]], [[RDX]] +; IF-EVL-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP0]], float [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret float [[MAX_LCSSA]] +; +; NO-VP-LABEL: @fmax( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, float [[START:%.*]], i64 0 +; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = fcmp fast ogt [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-NEXT: [[TMP10]] = select [[TMP9]], [[WIDE_LOAD]], [[VEC_PHI]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[TMP12:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv4f32( [[TMP10]]) +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[TMP13]], [[RDX]] +; NO-VP-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP13]], float [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret float [[MAX_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi float [ %start, %entry ], [ %max, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %cmp = fcmp fast ogt float %0, %rdx + %max = select i1 %cmp, float %0, float %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret float %max +} + +define float @fminimum(ptr %a, i64 %n, float %start) { +; IF-EVL-LABEL: @fminimum( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MIN:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP0]]) +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret float [[MIN_LCSSA]] +; +; NO-VP-LABEL: @fminimum( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16 +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0 +; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <8 x float> [[MINMAX_IDENT_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP4]], align 4 +; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP5]], align 4 +; NO-VP-NEXT: [[TMP6]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_LOAD]]) +; NO-VP-NEXT: [[TMP7]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]]) +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; NO-VP-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[RDX_MINMAX:%.*]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[TMP6]], <8 x float> [[TMP7]]) +; NO-VP-NEXT: [[TMP9:%.*]] = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> [[RDX_MINMAX]]) +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MIN:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP10]]) +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret float [[MIN_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi float [ %start, %entry ], [ %min, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %min = tail call float @llvm.minimum.f32(float %rdx, float %0) + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret float %min +} + +define float @fmaximum(ptr %a, i64 %n, float %start) { +; IF-EVL-LABEL: @fmaximum( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MAX:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP0]]) +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret float [[MAX_LCSSA]] +; +; NO-VP-LABEL: @fmaximum( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16 +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0 +; NO-VP-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <8 x float> [[MINMAX_IDENT_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP4]], align 4 +; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP5]], align 4 +; NO-VP-NEXT: [[TMP6]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_LOAD]]) +; NO-VP-NEXT: [[TMP7]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]]) +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; NO-VP-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[RDX_MINMAX:%.*]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[TMP6]], <8 x float> [[TMP7]]) +; NO-VP-NEXT: [[TMP9:%.*]] = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> [[RDX_MINMAX]]) +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MAX:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP10]]) +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret float [[MAX_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi float [ %start, %entry ], [ %max, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %max = tail call float @llvm.maximum.f32(float %rdx, float %0) + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret float %max +} + +define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) { +; IF-EVL-LABEL: @fmuladd( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[RDX]]) +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret float [[MULADD_LCSSA]] +; +; NO-VP-LABEL: @fmuladd( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[TMP6:%.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float [[START:%.*]], i32 0 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP7]] +; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 4 +; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP7]] +; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 4 +; NO-VP-NEXT: [[TMP12]] = call reassoc @llvm.fmuladd.nxv4f32( [[WIDE_LOAD]], [[WIDE_LOAD1]], [[VEC_PHI]]) +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[TMP14:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP12]]) +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; NO-VP-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP15]], float [[TMP16]], float [[RDX]]) +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret float [[MULADD_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi float [ %start, %entry ], [ %muladd, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds float, ptr %b, i64 %iv + %1 = load float, ptr %arrayidx2, align 4 + %muladd = tail call reassoc float @llvm.fmuladd.f32(float %0, float %1, float %rdx) + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret float %muladd +} + +define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) { +; IF-EVL-LABEL: @anyof_icmp( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP0]], 3 +; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV:%.*]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]] +; +; NO-VP-LABEL: @anyof_icmp( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = icmp slt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; NO-VP-NEXT: [[TMP10]] = or [[VEC_PHI]], [[TMP9]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP10]]) +; NO-VP-NEXT: [[TMP13:%.*]] = freeze i1 [[TMP12]] +; NO-VP-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 [[INV:%.*]], i32 [[START:%.*]] +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP14]], 3 +; NO-VP-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[ANYOF_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %anyof, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp.i = icmp slt i32 %0, 3 + %anyof = select i1 %cmp.i, i32 %inv, i32 %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %anyof +} + +define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) { +; IF-EVL-LABEL: @anyof_fcmp( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.body: +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] +; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP0]], 3.000000e+00 +; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV:%.*]], i32 [[RDX]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]] +; IF-EVL: for.end: +; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ] +; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]] +; +; NO-VP-LABEL: @anyof_fcmp( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NO-VP-NEXT: [[TMP9:%.*]] = fcmp fast olt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer) +; NO-VP-NEXT: [[TMP10]] = or [[VEC_PHI]], [[TMP9]] +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP10]]) +; NO-VP-NEXT: [[TMP13:%.*]] = freeze i1 [[TMP12]] +; NO-VP-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 [[INV:%.*]], i32 [[START:%.*]] +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP14]], 3.000000e+00 +; NO-VP-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]] +; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] +; NO-VP: for.end: +; NO-VP-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: ret i32 [[ANYOF_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %anyof, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load float, ptr %arrayidx, align 4 + %cmp.i = fcmp fast olt float %0, 3.0 + %anyof = select i1 %cmp.i, i32 %inv, i32 %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %anyof +} + +declare float @llvm.minimum.f32(float, float) +declare float @llvm.maximum.f32(float, float) +declare float @llvm.fmuladd.f32(float, float, float) + +attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" } + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll new file mode 100644 index 00000000000000..16db6cf828af8a --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -0,0 +1,166 @@ +; REQUIRES: asserts + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ +; RUN: -force-tail-folding-style=data-with-evl \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL-OUTLOOP %s + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ +; RUN: -prefer-inloop-reductions \ +; RUN: -force-tail-folding-style=data-with-evl \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL-INLOOP %s + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ +; RUN: -force-tail-folding-style=none \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ +; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP-OUTLOOP %s + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ +; RUN: -prefer-inloop-reductions \ +; RUN: -force-tail-folding-style=none \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ +; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP-INLOOP %s + + +define i32 @reduction(ptr %a, i64 %n, i32 %start) { +; IF-EVL-OUTLOOP-NOT: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { + +; IF-EVL-INLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { +; IF-EVL-INLOOP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF +; IF-EVL-INLOOP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; IF-EVL-INLOOP-NEXT: Live-in ir<%n> = original trip-count +; IF-EVL-INLOOP-EMPTY: +; IF-EVL-INLOOP: vector.ph: +; IF-EVL-INLOOP-NEXT: Successor(s): vector loop +; IF-EVL-INLOOP-EMPTY: +; IF-EVL-INLOOP-NEXT: vector loop: { +; IF-EVL-INLOOP-NEXT: vector.body: +; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; IF-EVL-INLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]> +; IF-EVL-INLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]> +; IF-EVL-INLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[EVL_PHI]]>, ir<%n> +; IF-EVL-INLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1> +; IF-EVL-INLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; IF-EVL-INLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> +; IF-EVL-INLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> +; IF-EVL-INLOOP-NEXT: REDUCE ir<[[ADD:%.+]]> = ir<[[RDX_PHI]]> + vp.reduce.add (ir<[[LD1]]>, vp<[[EVL]]>) +; IF-EVL-INLOOP-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 +; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> +; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> +; IF-EVL-INLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> +; IF-EVL-INLOOP-NEXT: No successors +; IF-EVL-INLOOP-NEXT: } +; IF-EVL-INLOOP-NEXT: Successor(s): middle.block +; IF-EVL-INLOOP-EMPTY: +; IF-EVL-INLOOP-NEXT: middle.block: +; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> +; IF-EVL-INLOOP-NEXT: EMIT branch-on-cond ir +; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb, scalar.ph +; IF-EVL-INLOOP-EMPTY: +; IF-EVL-INLOOP-NEXT: ir-bb: +; IF-EVL-INLOOP-NEXT: No successors +; IF-EVL-INLOOP-EMPTY: +; IF-EVL-INLOOP-NEXT: scalar.ph: +; IF-EVL-INLOOP-NEXT: No successors +; IF-EVL-INLOOP-EMPTY: +; IF-EVL-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]> +; IF-EVL-INLOOP-NEXT: } +; + +; NO-VP-OUTLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; NO-VP-OUTLOOP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF +; NO-VP-OUTLOOP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; NO-VP-OUTLOOP-NEXT: Live-in ir<%n> = original trip-count +; NO-VP-OUTLOOP-EMPTY: +; NO-VP-OUTLOOP: vector.ph: +; NO-VP-OUTLOOP-NEXT: Successor(s): vector loop +; NO-VP-OUTLOOP-EMPTY: +; NO-VP-OUTLOOP-NEXT: vector loop: { +; NO-VP-OUTLOOP-NEXT: vector.body: +; NO-VP-OUTLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; NO-VP-OUTLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]> +; NO-VP-OUTLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; NO-VP-OUTLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; NO-VP-OUTLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> +; NO-VP-OUTLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]> +; NO-VP-OUTLOOP-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD1]]>, ir<[[RDX_PHI]]> +; NO-VP-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; NO-VP-OUTLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> +; NO-VP-OUTLOOP-NEXT: No successors +; NO-VP-OUTLOOP-NEXT: } +; NO-VP-OUTLOOP-NEXT: Successor(s): middle.block +; NO-VP-OUTLOOP-EMPTY: +; NO-VP-OUTLOOP-NEXT: middle.block: +; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> +; NO-VP-OUTLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]> +; NO-VP-OUTLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]> +; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb, scalar.ph +; NO-VP-OUTLOOP-EMPTY: +; NO-VP-OUTLOOP-NEXT: ir-bb: +; NO-VP-OUTLOOP-NEXT: No successors +; NO-VP-OUTLOOP-EMPTY: +; NO-VP-OUTLOOP-NEXT: scalar.ph: +; NO-VP-OUTLOOP-NEXT: No successors +; NO-VP-OUTLOOP-EMPTY: +; NO-VP-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]> +; NO-VP-OUTLOOP-NEXT: } +; + +; NO-VP-INLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; NO-VP-INLOOP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF +; NO-VP-INLOOP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; NO-VP-INLOOP-NEXT: Live-in ir<%n> = original trip-count +; NO-VP-INLOOP-EMPTY: +; NO-VP-INLOOP: vector.ph: +; NO-VP-INLOOP-NEXT: Successor(s): vector loop +; NO-VP-INLOOP-EMPTY: +; NO-VP-INLOOP-NEXT: vector loop: { +; NO-VP-INLOOP-NEXT: vector.body: +; NO-VP-INLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; NO-VP-INLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]> +; NO-VP-INLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; NO-VP-INLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; NO-VP-INLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> +; NO-VP-INLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]> +; NO-VP-INLOOP-NEXT: REDUCE ir<[[ADD:%.+]]> = ir<[[RDX_PHI]]> + reduce.add (ir<[[LD1]]>) +; NO-VP-INLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; NO-VP-INLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> +; NO-VP-INLOOP-NEXT: No successors +; NO-VP-INLOOP-NEXT: } +; NO-VP-INLOOP-NEXT: Successor(s): middle.block +; NO-VP-INLOOP-EMPTY: +; NO-VP-INLOOP-NEXT: middle.block: +; NO-VP-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> +; NO-VP-INLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]> +; NO-VP-INLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]> +; NO-VP-INLOOP-NEXT: Successor(s): ir-bb, scalar.ph +; NO-VP-INLOOP-EMPTY: +; NO-VP-INLOOP-NEXT: ir-bb: +; NO-VP-INLOOP-NEXT: No successors +; NO-VP-INLOOP-EMPTY: +; NO-VP-INLOOP-NEXT: scalar.ph: +; NO-VP-INLOOP-NEXT: No successors +; NO-VP-INLOOP-EMPTY: +; NO-VP-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]> +; NO-VP-INLOOP-NEXT: } +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %0, %rdx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret i32 %add +} + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll new file mode 100644 index 00000000000000..42a9ab0ca270fa --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll @@ -0,0 +1,437 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -mtriple x86_64 -prefer-predicate-over-epilogue=predicate-dont-vectorize -mcpu=skylake-avx512 -S %s | FileCheck %s + +define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) { +; CHECK-LABEL: define void @sdiv_feeding_gep( +; CHECK-SAME: ptr [[DST:%.*]], i32 [[X:%.*]], i64 [[M:%.*]], i64 [[CONV6:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CONV61:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP0]], 4294967295 +; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP2]], [[TMP3]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_SDIV_CONTINUE8:.*]] ] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP6:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0 +; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_SDIV_IF:.*]], label %[[PRED_SDIV_CONTINUE:.*]] +; CHECK: [[PRED_SDIV_IF]]: +; CHECK-NEXT: [[TMP8:%.*]] = sdiv i64 [[M]], [[CONV6]] +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE]] +; CHECK: [[PRED_SDIV_CONTINUE]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP8]], %[[PRED_SDIV_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP6]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_SDIV_IF3:.*]], label %[[PRED_SDIV_CONTINUE4:.*]] +; CHECK: [[PRED_SDIV_IF3]]: +; CHECK-NEXT: [[TMP11:%.*]] = sdiv i64 [[M]], [[CONV6]] +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE4]] +; CHECK: [[PRED_SDIV_CONTINUE4]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi i64 [ poison, %[[PRED_SDIV_CONTINUE]] ], [ [[TMP11]], %[[PRED_SDIV_IF3]] ] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP6]], i32 2 +; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_SDIV_IF5:.*]], label %[[PRED_SDIV_CONTINUE6:.*]] +; CHECK: [[PRED_SDIV_IF5]]: +; CHECK-NEXT: [[TMP14:%.*]] = sdiv i64 [[M]], [[CONV6]] +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE6]] +; CHECK: [[PRED_SDIV_CONTINUE6]]: +; CHECK-NEXT: [[TMP15:%.*]] = phi i64 [ poison, %[[PRED_SDIV_CONTINUE4]] ], [ [[TMP14]], %[[PRED_SDIV_IF5]] ] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP6]], i32 3 +; CHECK-NEXT: br i1 [[TMP16]], label %[[PRED_SDIV_IF7:.*]], label %[[PRED_SDIV_CONTINUE8]] +; CHECK: [[PRED_SDIV_IF7]]: +; CHECK-NEXT: [[TMP17:%.*]] = sdiv i64 [[M]], [[CONV6]] +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE8]] +; CHECK: [[PRED_SDIV_CONTINUE8]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi i64 [ poison, %[[PRED_SDIV_CONTINUE6]] ], [ [[TMP17]], %[[PRED_SDIV_IF7]] ] +; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP9]], [[CONV61]] +; CHECK-NEXT: [[TMP21:%.*]] = sub i64 [[TMP5]], [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = trunc i64 [[TMP21]] to i32 +; CHECK-NEXT: [[TMP23:%.*]] = mul i32 [[X]], [[TMP19]] +; CHECK-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP26]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP27]], i32 8, <4 x i1> [[TMP6]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[DIV18:%.*]] = sdiv i64 [[M]], [[CONV6]] +; CHECK-NEXT: [[CONV20:%.*]] = trunc i64 [[DIV18]] to i32 +; CHECK-NEXT: [[MUL30:%.*]] = mul i64 [[DIV18]], [[CONV61]] +; CHECK-NEXT: [[SUB31:%.*]] = sub i64 [[IV]], [[MUL30]] +; CHECK-NEXT: [[CONV34:%.*]] = trunc i64 [[SUB31]] to i32 +; CHECK-NEXT: [[MUL35:%.*]] = mul i32 [[X]], [[CONV20]] +; CHECK-NEXT: [[ADD36:%.*]] = add i32 [[MUL35]], [[CONV34]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[ADD36]] to i64 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[DST]], i64 [[IDXPROM]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP]], align 8 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %conv61 = zext i32 %x to i64 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %div18 = sdiv i64 %M, %conv6 + %conv20 = trunc i64 %div18 to i32 + %mul30 = mul i64 %div18, %conv61 + %sub31 = sub i64 %iv, %mul30 + %conv34 = trunc i64 %sub31 to i32 + %mul35 = mul i32 %x, %conv20 + %add36 = add i32 %mul35, %conv34 + %idxprom = sext i32 %add36 to i64 + %gep = getelementptr double, ptr %dst, i64 %idxprom + store double 0.000000e+00, ptr %gep, align 8 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %N + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) { +; CHECK-LABEL: define void @sdiv_feeding_gep_predicated( +; CHECK-SAME: ptr [[DST:%.*]], i32 [[X:%.*]], i64 [[M:%.*]], i64 [[CONV6:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CONV61:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP0]], 4294967295 +; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP2]], [[TMP3]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[M]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_SDIV_CONTINUE8:.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_SDIV_CONTINUE8]] ] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP6]], <4 x i1> [[TMP7]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_SDIV_IF:.*]], label %[[PRED_SDIV_CONTINUE:.*]] +; CHECK: [[PRED_SDIV_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = sdiv i64 [[M]], [[CONV6]] +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE]] +; CHECK: [[PRED_SDIV_CONTINUE]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP10]], %[[PRED_SDIV_IF]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP12]], label %[[PRED_SDIV_IF3:.*]], label %[[PRED_SDIV_CONTINUE4:.*]] +; CHECK: [[PRED_SDIV_IF3]]: +; CHECK-NEXT: [[TMP13:%.*]] = sdiv i64 [[M]], [[CONV6]] +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE4]] +; CHECK: [[PRED_SDIV_CONTINUE4]]: +; CHECK-NEXT: [[TMP14:%.*]] = phi i64 [ poison, %[[PRED_SDIV_CONTINUE]] ], [ [[TMP13]], %[[PRED_SDIV_IF3]] ] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2 +; CHECK-NEXT: br i1 [[TMP15]], label %[[PRED_SDIV_IF5:.*]], label %[[PRED_SDIV_CONTINUE6:.*]] +; CHECK: [[PRED_SDIV_IF5]]: +; CHECK-NEXT: [[TMP16:%.*]] = sdiv i64 [[M]], [[CONV6]] +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE6]] +; CHECK: [[PRED_SDIV_CONTINUE6]]: +; CHECK-NEXT: [[TMP17:%.*]] = phi i64 [ poison, %[[PRED_SDIV_CONTINUE4]] ], [ [[TMP16]], %[[PRED_SDIV_IF5]] ] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3 +; CHECK-NEXT: br i1 [[TMP18]], label %[[PRED_SDIV_IF7:.*]], label %[[PRED_SDIV_CONTINUE8]] +; CHECK: [[PRED_SDIV_IF7]]: +; CHECK-NEXT: [[TMP19:%.*]] = sdiv i64 [[M]], [[CONV6]] +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE8]] +; CHECK: [[PRED_SDIV_CONTINUE8]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi i64 [ poison, %[[PRED_SDIV_CONTINUE6]] ], [ [[TMP19]], %[[PRED_SDIV_IF7]] ] +; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP11]] to i32 +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP11]], [[CONV61]] +; CHECK-NEXT: [[TMP23:%.*]] = sub i64 [[TMP5]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 +; CHECK-NEXT: [[TMP25:%.*]] = mul i32 [[X]], [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[TMP25]], [[TMP24]] +; CHECK-NEXT: [[TMP27:%.*]] = sext i32 [[TMP26]] to i64 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[TMP28]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP29]], i32 8, <4 x i1> [[TMP8]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[C:%.*]] = icmp ule i64 [[IV]], [[M]] +; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[DIV18:%.*]] = sdiv i64 [[M]], [[CONV6]] +; CHECK-NEXT: [[CONV20:%.*]] = trunc i64 [[DIV18]] to i32 +; CHECK-NEXT: [[MUL30:%.*]] = mul i64 [[DIV18]], [[CONV61]] +; CHECK-NEXT: [[SUB31:%.*]] = sub i64 [[IV]], [[MUL30]] +; CHECK-NEXT: [[CONV34:%.*]] = trunc i64 [[SUB31]] to i32 +; CHECK-NEXT: [[MUL35:%.*]] = mul i32 [[X]], [[CONV20]] +; CHECK-NEXT: [[ADD36:%.*]] = add i32 [[MUL35]], [[CONV34]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[ADD36]] to i64 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[DST]], i64 [[IDXPROM]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP]], align 8 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %conv61 = zext i32 %x to i64 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %c = icmp ule i64 %iv, %M + br i1 %c, label %then, label %loop.latch + +then: + %div18 = sdiv i64 %M, %conv6 + %conv20 = trunc i64 %div18 to i32 + %mul30 = mul i64 %div18, %conv61 + %sub31 = sub i64 %iv, %mul30 + %conv34 = trunc i64 %sub31 to i32 + %mul35 = mul i32 %x, %conv20 + %add36 = add i32 %mul35, %conv34 + %idxprom = sext i32 %add36 to i64 + %gep = getelementptr double, ptr %dst, i64 %idxprom + store double 0.000000e+00, ptr %gep, align 8 + br label %loop.latch + +loop.latch: + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %N + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) { +; CHECK-LABEL: define void @udiv_urem_feeding_gep( +; CHECK-SAME: i64 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MUL_1_I:%.*]] = mul i64 [[X]], [[X]] +; CHECK-NEXT: [[MUL_2_I:%.*]] = mul i64 [[MUL_1_I]], [[X]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[N]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[N]], 4294967295 +; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP2]], [[TMP3]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_UREM_CONTINUE6:.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_UREM_CONTINUE6]] ] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0 +; CHECK-NEXT: br i1 [[TMP6]], label %[[PRED_UREM_IF:.*]], label %[[PRED_UREM_CONTINUE:.*]] +; CHECK: [[PRED_UREM_IF]]: +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = udiv i64 [[TMP7]], [[MUL_2_I]] +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = urem i64 [[TMP7]], [[MUL_2_I]] +; CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[TMP10]], [[MUL_1_I]] +; CHECK-NEXT: [[TMP12:%.*]] = urem i64 [[TMP10]], [[MUL_1_I]] +; CHECK-NEXT: [[TMP13:%.*]] = udiv i64 [[TMP12]], [[X]] +; CHECK-NEXT: [[TMP14:%.*]] = urem i64 [[TMP12]], [[X]] +; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE]] +; CHECK: [[PRED_UREM_CONTINUE]]: +; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i64> [ poison, %[[VECTOR_BODY]] ], [ [[TMP9]], %[[PRED_UREM_IF]] ] +; CHECK-NEXT: [[TMP16:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP10]], %[[PRED_UREM_IF]] ] +; CHECK-NEXT: [[TMP17:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP11]], %[[PRED_UREM_IF]] ] +; CHECK-NEXT: [[TMP18:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_UREM_IF]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP13]], %[[PRED_UREM_IF]] ] +; CHECK-NEXT: [[TMP20:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP14]], %[[PRED_UREM_IF]] ] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1 +; CHECK-NEXT: br i1 [[TMP21]], label %[[PRED_UREM_IF1:.*]], label %[[PRED_UREM_CONTINUE2:.*]] +; CHECK: [[PRED_UREM_IF1]]: +; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP23:%.*]] = udiv i64 [[TMP22]], [[MUL_2_I]] +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP23]], i32 1 +; CHECK-NEXT: [[TMP25:%.*]] = urem i64 [[TMP22]], [[MUL_2_I]] +; CHECK-NEXT: [[TMP26:%.*]] = udiv i64 [[TMP25]], [[MUL_1_I]] +; CHECK-NEXT: [[TMP27:%.*]] = urem i64 [[TMP25]], [[MUL_1_I]] +; CHECK-NEXT: [[TMP28:%.*]] = udiv i64 [[TMP27]], [[X]] +; CHECK-NEXT: [[TMP29:%.*]] = urem i64 [[TMP27]], [[X]] +; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE2]] +; CHECK: [[PRED_UREM_CONTINUE2]]: +; CHECK-NEXT: [[TMP30:%.*]] = phi <4 x i64> [ [[TMP15]], %[[PRED_UREM_CONTINUE]] ], [ [[TMP24]], %[[PRED_UREM_IF1]] ] +; CHECK-NEXT: [[TMP31:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE]] ], [ [[TMP25]], %[[PRED_UREM_IF1]] ] +; CHECK-NEXT: [[TMP32:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE]] ], [ [[TMP26]], %[[PRED_UREM_IF1]] ] +; CHECK-NEXT: [[TMP33:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE]] ], [ [[TMP27]], %[[PRED_UREM_IF1]] ] +; CHECK-NEXT: [[TMP34:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE]] ], [ [[TMP28]], %[[PRED_UREM_IF1]] ] +; CHECK-NEXT: [[TMP35:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE]] ], [ [[TMP29]], %[[PRED_UREM_IF1]] ] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2 +; CHECK-NEXT: br i1 [[TMP36]], label %[[PRED_UREM_IF3:.*]], label %[[PRED_UREM_CONTINUE4:.*]] +; CHECK: [[PRED_UREM_IF3]]: +; CHECK-NEXT: [[TMP37:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP38:%.*]] = udiv i64 [[TMP37]], [[MUL_2_I]] +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i64> [[TMP30]], i64 [[TMP38]], i32 2 +; CHECK-NEXT: [[TMP40:%.*]] = urem i64 [[TMP37]], [[MUL_2_I]] +; CHECK-NEXT: [[TMP41:%.*]] = udiv i64 [[TMP40]], [[MUL_1_I]] +; CHECK-NEXT: [[TMP42:%.*]] = urem i64 [[TMP40]], [[MUL_1_I]] +; CHECK-NEXT: [[TMP43:%.*]] = udiv i64 [[TMP42]], [[X]] +; CHECK-NEXT: [[TMP44:%.*]] = urem i64 [[TMP42]], [[X]] +; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE4]] +; CHECK: [[PRED_UREM_CONTINUE4]]: +; CHECK-NEXT: [[TMP45:%.*]] = phi <4 x i64> [ [[TMP30]], %[[PRED_UREM_CONTINUE2]] ], [ [[TMP39]], %[[PRED_UREM_IF3]] ] +; CHECK-NEXT: [[TMP46:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE2]] ], [ [[TMP40]], %[[PRED_UREM_IF3]] ] +; CHECK-NEXT: [[TMP47:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE2]] ], [ [[TMP41]], %[[PRED_UREM_IF3]] ] +; CHECK-NEXT: [[TMP48:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE2]] ], [ [[TMP42]], %[[PRED_UREM_IF3]] ] +; CHECK-NEXT: [[TMP49:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE2]] ], [ [[TMP43]], %[[PRED_UREM_IF3]] ] +; CHECK-NEXT: [[TMP50:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE2]] ], [ [[TMP44]], %[[PRED_UREM_IF3]] ] +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3 +; CHECK-NEXT: br i1 [[TMP51]], label %[[PRED_UREM_IF5:.*]], label %[[PRED_UREM_CONTINUE6]] +; CHECK: [[PRED_UREM_IF5]]: +; CHECK-NEXT: [[TMP52:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP53:%.*]] = udiv i64 [[TMP52]], [[MUL_2_I]] +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i64> [[TMP45]], i64 [[TMP53]], i32 3 +; CHECK-NEXT: [[TMP55:%.*]] = urem i64 [[TMP52]], [[MUL_2_I]] +; CHECK-NEXT: [[TMP56:%.*]] = udiv i64 [[TMP55]], [[MUL_1_I]] +; CHECK-NEXT: [[TMP57:%.*]] = urem i64 [[TMP55]], [[MUL_1_I]] +; CHECK-NEXT: [[TMP58:%.*]] = udiv i64 [[TMP57]], [[X]] +; CHECK-NEXT: [[TMP59:%.*]] = urem i64 [[TMP57]], [[X]] +; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE6]] +; CHECK: [[PRED_UREM_CONTINUE6]]: +; CHECK-NEXT: [[TMP60:%.*]] = phi <4 x i64> [ [[TMP45]], %[[PRED_UREM_CONTINUE4]] ], [ [[TMP54]], %[[PRED_UREM_IF5]] ] +; CHECK-NEXT: [[TMP61:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE4]] ], [ [[TMP55]], %[[PRED_UREM_IF5]] ] +; CHECK-NEXT: [[TMP62:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE4]] ], [ [[TMP56]], %[[PRED_UREM_IF5]] ] +; CHECK-NEXT: [[TMP63:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE4]] ], [ [[TMP57]], %[[PRED_UREM_IF5]] ] +; CHECK-NEXT: [[TMP64:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE4]] ], [ [[TMP58]], %[[PRED_UREM_IF5]] ] +; CHECK-NEXT: [[TMP65:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE4]] ], [ [[TMP59]], %[[PRED_UREM_IF5]] ] +; CHECK-NEXT: [[TMP66:%.*]] = extractelement <4 x i64> [[TMP60]], i32 0 +; CHECK-NEXT: [[TMP67:%.*]] = mul i64 [[X]], [[TMP66]] +; CHECK-NEXT: [[TMP68:%.*]] = add i64 [[TMP67]], [[TMP17]] +; CHECK-NEXT: [[TMP69:%.*]] = mul i64 [[TMP68]], [[X]] +; CHECK-NEXT: [[TMP70:%.*]] = add i64 [[TMP69]], [[TMP19]] +; CHECK-NEXT: [[TMP71:%.*]] = mul i64 [[TMP70]], [[X]] +; CHECK-NEXT: [[TMP72:%.*]] = add i64 [[TMP71]], [[TMP20]] +; CHECK-NEXT: [[TMP73:%.*]] = shl i64 [[TMP72]], 32 +; CHECK-NEXT: [[TMP74:%.*]] = ashr i64 [[TMP73]], 32 +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP74]] +; CHECK-NEXT: [[TMP76:%.*]] = getelementptr i64, ptr [[TMP75]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP60]], ptr [[TMP76]], i32 4, <4 x i1> [[TMP5]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP77:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP77]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[DIV_I:%.*]] = udiv i64 [[IV]], [[MUL_2_I]] +; CHECK-NEXT: [[REM_I:%.*]] = urem i64 [[IV]], [[MUL_2_I]] +; CHECK-NEXT: [[DIV_1_I:%.*]] = udiv i64 [[REM_I]], [[MUL_1_I]] +; CHECK-NEXT: [[REM_1_I:%.*]] = urem i64 [[REM_I]], [[MUL_1_I]] +; CHECK-NEXT: [[DIV_2_I:%.*]] = udiv i64 [[REM_1_I]], [[X]] +; CHECK-NEXT: [[REM_2_I:%.*]] = urem i64 [[REM_1_I]], [[X]] +; CHECK-NEXT: [[MUL_I:%.*]] = mul i64 [[X]], [[DIV_I]] +; CHECK-NEXT: [[ADD_I:%.*]] = add i64 [[MUL_I]], [[DIV_1_I]] +; CHECK-NEXT: [[MUL_1_I9:%.*]] = mul i64 [[ADD_I]], [[X]] +; CHECK-NEXT: [[ADD_1_I:%.*]] = add i64 [[MUL_1_I9]], [[DIV_2_I]] +; CHECK-NEXT: [[MUL_2_I11:%.*]] = mul i64 [[ADD_1_I]], [[X]] +; CHECK-NEXT: [[ADD_2_I:%.*]] = add i64 [[MUL_2_I11]], [[REM_2_I]] +; CHECK-NEXT: [[SEXT_I:%.*]] = shl i64 [[ADD_2_I]], 32 +; CHECK-NEXT: [[CONV6_I:%.*]] = ashr i64 [[SEXT_I]], 32 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[DST]], i64 [[CONV6_I]] +; CHECK-NEXT: store i64 [[DIV_I]], ptr [[GEP]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %mul.1.i = mul i64 %x, %x + %mul.2.i = mul i64 %mul.1.i, %x + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %div.i = udiv i64 %iv, %mul.2.i + %rem.i = urem i64 %iv, %mul.2.i + %div.1.i = udiv i64 %rem.i, %mul.1.i + %rem.1.i = urem i64 %rem.i, %mul.1.i + %div.2.i = udiv i64 %rem.1.i, %x + %rem.2.i = urem i64 %rem.1.i, %x + %mul.i = mul i64 %x, %div.i + %add.i = add i64 %mul.i, %div.1.i + %mul.1.i9 = mul i64 %add.i, %x + %add.1.i = add i64 %mul.1.i9, %div.2.i + %mul.2.i11 = mul i64 %add.1.i, %x + %add.2.i = add i64 %mul.2.i11, %rem.2.i + %sext.i = shl i64 %add.2.i, 32 + %conv6.i = ashr i64 %sext.i, 32 + %gep = getelementptr i64, ptr %dst, i64 %conv6.i + store i64 %div.i, ptr %gep, align 4 + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/X86/ephemeral-recipes.ll b/llvm/test/Transforms/LoopVectorize/X86/ephemeral-recipes.ll index 450caccefb7584..8cee513b1802b2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/ephemeral-recipes.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/ephemeral-recipes.ll @@ -8,313 +8,17 @@ define i32 @ephemeral_load_and_compare_iv_used_outside(ptr %start, ptr %end) #0 ; CHECK-LABEL: define i32 @ephemeral_load_and_compare_iv_used_outside( ; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[END2:%.*]] = ptrtoint ptr [[END]] to i64 -; CHECK-NEXT: [[START1:%.*]] = ptrtoint ptr [[START]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[START1]], [[END2]] -; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 128 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 128 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], -8 -; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP3]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i32> @llvm.masked.gather.v32i32.v32p0(<32 x ptr> [[TMP4]], i32 4, <32 x i1> , <32 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <32 x i32> @llvm.masked.gather.v32i32.v32p0(<32 x ptr> [[TMP5]], i32 4, <32 x i1> , <32 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <32 x i32> @llvm.masked.gather.v32i32.v32p0(<32 x ptr> [[TMP6]], i32 4, <32 x i1> , <32 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <32 x i32> @llvm.masked.gather.v32i32.v32p0(<32 x ptr> [[TMP7]], i32 4, <32 x i1> , <32 x i32> poison) -; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i32> [[WIDE_MASKED_GATHER3]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i32> [[WIDE_MASKED_GATHER4]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i32> [[WIDE_MASKED_GATHER5]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <32 x i1> [[TMP8]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP12]]) -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <32 x i1> [[TMP8]], i32 1 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP13]]) -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <32 x i1> [[TMP8]], i32 2 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP14]]) -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <32 x i1> [[TMP8]], i32 3 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP15]]) -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <32 x i1> [[TMP8]], i32 4 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP16]]) -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <32 x i1> [[TMP8]], i32 5 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP17]]) -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <32 x i1> [[TMP8]], i32 6 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP18]]) -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <32 x i1> [[TMP8]], i32 7 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP19]]) -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <32 x i1> [[TMP8]], i32 8 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP20]]) -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <32 x i1> [[TMP8]], i32 9 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP21]]) -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <32 x i1> [[TMP8]], i32 10 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP22]]) -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <32 x i1> [[TMP8]], i32 11 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP23]]) -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <32 x i1> [[TMP8]], i32 12 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP24]]) -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <32 x i1> [[TMP8]], i32 13 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP25]]) -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <32 x i1> [[TMP8]], i32 14 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP26]]) -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <32 x i1> [[TMP8]], i32 15 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP27]]) -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <32 x i1> [[TMP8]], i32 16 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP28]]) -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <32 x i1> [[TMP8]], i32 17 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP29]]) -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <32 x i1> [[TMP8]], i32 18 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP30]]) -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <32 x i1> [[TMP8]], i32 19 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP31]]) -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <32 x i1> [[TMP8]], i32 20 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP32]]) -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <32 x i1> [[TMP8]], i32 21 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP33]]) -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <32 x i1> [[TMP8]], i32 22 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP34]]) -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <32 x i1> [[TMP8]], i32 23 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP35]]) -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <32 x i1> [[TMP8]], i32 24 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP36]]) -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <32 x i1> [[TMP8]], i32 25 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP37]]) -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <32 x i1> [[TMP8]], i32 26 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP38]]) -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <32 x i1> [[TMP8]], i32 27 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP39]]) -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <32 x i1> [[TMP8]], i32 28 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP40]]) -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <32 x i1> [[TMP8]], i32 29 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP41]]) -; CHECK-NEXT: [[TMP42:%.*]] = extractelement <32 x i1> [[TMP8]], i32 30 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP42]]) -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <32 x i1> [[TMP8]], i32 31 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP43]]) -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <32 x i1> [[TMP9]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP44]]) -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <32 x i1> [[TMP9]], i32 1 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP45]]) -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <32 x i1> [[TMP9]], i32 2 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP46]]) -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <32 x i1> [[TMP9]], i32 3 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP47]]) -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <32 x i1> [[TMP9]], i32 4 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP48]]) -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <32 x i1> [[TMP9]], i32 5 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP49]]) -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <32 x i1> [[TMP9]], i32 6 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP50]]) -; CHECK-NEXT: [[TMP51:%.*]] = extractelement <32 x i1> [[TMP9]], i32 7 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP51]]) -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <32 x i1> [[TMP9]], i32 8 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP52]]) -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <32 x i1> [[TMP9]], i32 9 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP53]]) -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <32 x i1> [[TMP9]], i32 10 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP54]]) -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <32 x i1> [[TMP9]], i32 11 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP55]]) -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <32 x i1> [[TMP9]], i32 12 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP56]]) -; CHECK-NEXT: [[TMP57:%.*]] = extractelement <32 x i1> [[TMP9]], i32 13 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP57]]) -; CHECK-NEXT: [[TMP58:%.*]] = extractelement <32 x i1> [[TMP9]], i32 14 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP58]]) -; CHECK-NEXT: [[TMP59:%.*]] = extractelement <32 x i1> [[TMP9]], i32 15 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP59]]) -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <32 x i1> [[TMP9]], i32 16 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP60]]) -; CHECK-NEXT: [[TMP61:%.*]] = extractelement <32 x i1> [[TMP9]], i32 17 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP61]]) -; CHECK-NEXT: [[TMP62:%.*]] = extractelement <32 x i1> [[TMP9]], i32 18 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP62]]) -; CHECK-NEXT: [[TMP63:%.*]] = extractelement <32 x i1> [[TMP9]], i32 19 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP63]]) -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <32 x i1> [[TMP9]], i32 20 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP64]]) -; CHECK-NEXT: [[TMP65:%.*]] = extractelement <32 x i1> [[TMP9]], i32 21 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP65]]) -; CHECK-NEXT: [[TMP66:%.*]] = extractelement <32 x i1> [[TMP9]], i32 22 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP66]]) -; CHECK-NEXT: [[TMP67:%.*]] = extractelement <32 x i1> [[TMP9]], i32 23 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP67]]) -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <32 x i1> [[TMP9]], i32 24 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP68]]) -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <32 x i1> [[TMP9]], i32 25 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP69]]) -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <32 x i1> [[TMP9]], i32 26 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP70]]) -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <32 x i1> [[TMP9]], i32 27 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP71]]) -; CHECK-NEXT: [[TMP72:%.*]] = extractelement <32 x i1> [[TMP9]], i32 28 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP72]]) -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <32 x i1> [[TMP9]], i32 29 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP73]]) -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <32 x i1> [[TMP9]], i32 30 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP74]]) -; CHECK-NEXT: [[TMP75:%.*]] = extractelement <32 x i1> [[TMP9]], i32 31 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP75]]) -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <32 x i1> [[TMP10]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP76]]) -; CHECK-NEXT: [[TMP77:%.*]] = extractelement <32 x i1> [[TMP10]], i32 1 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP77]]) -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <32 x i1> [[TMP10]], i32 2 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP78]]) -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <32 x i1> [[TMP10]], i32 3 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP79]]) -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <32 x i1> [[TMP10]], i32 4 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP80]]) -; CHECK-NEXT: [[TMP81:%.*]] = extractelement <32 x i1> [[TMP10]], i32 5 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP81]]) -; CHECK-NEXT: [[TMP82:%.*]] = extractelement <32 x i1> [[TMP10]], i32 6 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP82]]) -; CHECK-NEXT: [[TMP83:%.*]] = extractelement <32 x i1> [[TMP10]], i32 7 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP83]]) -; CHECK-NEXT: [[TMP84:%.*]] = extractelement <32 x i1> [[TMP10]], i32 8 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP84]]) -; CHECK-NEXT: [[TMP85:%.*]] = extractelement <32 x i1> [[TMP10]], i32 9 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP85]]) -; CHECK-NEXT: [[TMP86:%.*]] = extractelement <32 x i1> [[TMP10]], i32 10 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP86]]) -; CHECK-NEXT: [[TMP87:%.*]] = extractelement <32 x i1> [[TMP10]], i32 11 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP87]]) -; CHECK-NEXT: [[TMP88:%.*]] = extractelement <32 x i1> [[TMP10]], i32 12 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP88]]) -; CHECK-NEXT: [[TMP89:%.*]] = extractelement <32 x i1> [[TMP10]], i32 13 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP89]]) -; CHECK-NEXT: [[TMP90:%.*]] = extractelement <32 x i1> [[TMP10]], i32 14 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP90]]) -; CHECK-NEXT: [[TMP91:%.*]] = extractelement <32 x i1> [[TMP10]], i32 15 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP91]]) -; CHECK-NEXT: [[TMP92:%.*]] = extractelement <32 x i1> [[TMP10]], i32 16 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP92]]) -; CHECK-NEXT: [[TMP93:%.*]] = extractelement <32 x i1> [[TMP10]], i32 17 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP93]]) -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <32 x i1> [[TMP10]], i32 18 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP94]]) -; CHECK-NEXT: [[TMP95:%.*]] = extractelement <32 x i1> [[TMP10]], i32 19 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP95]]) -; CHECK-NEXT: [[TMP96:%.*]] = extractelement <32 x i1> [[TMP10]], i32 20 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP96]]) -; CHECK-NEXT: [[TMP97:%.*]] = extractelement <32 x i1> [[TMP10]], i32 21 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP97]]) -; CHECK-NEXT: [[TMP98:%.*]] = extractelement <32 x i1> [[TMP10]], i32 22 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP98]]) -; CHECK-NEXT: [[TMP99:%.*]] = extractelement <32 x i1> [[TMP10]], i32 23 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP99]]) -; CHECK-NEXT: [[TMP100:%.*]] = extractelement <32 x i1> [[TMP10]], i32 24 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP100]]) -; CHECK-NEXT: [[TMP101:%.*]] = extractelement <32 x i1> [[TMP10]], i32 25 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP101]]) -; CHECK-NEXT: [[TMP102:%.*]] = extractelement <32 x i1> [[TMP10]], i32 26 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP102]]) -; CHECK-NEXT: [[TMP103:%.*]] = extractelement <32 x i1> [[TMP10]], i32 27 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP103]]) -; CHECK-NEXT: [[TMP104:%.*]] = extractelement <32 x i1> [[TMP10]], i32 28 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP104]]) -; CHECK-NEXT: [[TMP105:%.*]] = extractelement <32 x i1> [[TMP10]], i32 29 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP105]]) -; CHECK-NEXT: [[TMP106:%.*]] = extractelement <32 x i1> [[TMP10]], i32 30 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP106]]) -; CHECK-NEXT: [[TMP107:%.*]] = extractelement <32 x i1> [[TMP10]], i32 31 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP107]]) -; CHECK-NEXT: [[TMP108:%.*]] = extractelement <32 x i1> [[TMP11]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP108]]) -; CHECK-NEXT: [[TMP109:%.*]] = extractelement <32 x i1> [[TMP11]], i32 1 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP109]]) -; CHECK-NEXT: [[TMP110:%.*]] = extractelement <32 x i1> [[TMP11]], i32 2 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP110]]) -; CHECK-NEXT: [[TMP111:%.*]] = extractelement <32 x i1> [[TMP11]], i32 3 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP111]]) -; CHECK-NEXT: [[TMP112:%.*]] = extractelement <32 x i1> [[TMP11]], i32 4 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP112]]) -; CHECK-NEXT: [[TMP113:%.*]] = extractelement <32 x i1> [[TMP11]], i32 5 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP113]]) -; CHECK-NEXT: [[TMP114:%.*]] = extractelement <32 x i1> [[TMP11]], i32 6 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP114]]) -; CHECK-NEXT: [[TMP115:%.*]] = extractelement <32 x i1> [[TMP11]], i32 7 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP115]]) -; CHECK-NEXT: [[TMP116:%.*]] = extractelement <32 x i1> [[TMP11]], i32 8 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP116]]) -; CHECK-NEXT: [[TMP117:%.*]] = extractelement <32 x i1> [[TMP11]], i32 9 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP117]]) -; CHECK-NEXT: [[TMP118:%.*]] = extractelement <32 x i1> [[TMP11]], i32 10 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP118]]) -; CHECK-NEXT: [[TMP119:%.*]] = extractelement <32 x i1> [[TMP11]], i32 11 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP119]]) -; CHECK-NEXT: [[TMP120:%.*]] = extractelement <32 x i1> [[TMP11]], i32 12 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP120]]) -; CHECK-NEXT: [[TMP121:%.*]] = extractelement <32 x i1> [[TMP11]], i32 13 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP121]]) -; CHECK-NEXT: [[TMP122:%.*]] = extractelement <32 x i1> [[TMP11]], i32 14 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP122]]) -; CHECK-NEXT: [[TMP123:%.*]] = extractelement <32 x i1> [[TMP11]], i32 15 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP123]]) -; CHECK-NEXT: [[TMP124:%.*]] = extractelement <32 x i1> [[TMP11]], i32 16 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP124]]) -; CHECK-NEXT: [[TMP125:%.*]] = extractelement <32 x i1> [[TMP11]], i32 17 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP125]]) -; CHECK-NEXT: [[TMP126:%.*]] = extractelement <32 x i1> [[TMP11]], i32 18 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP126]]) -; CHECK-NEXT: [[TMP127:%.*]] = extractelement <32 x i1> [[TMP11]], i32 19 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP127]]) -; CHECK-NEXT: [[TMP128:%.*]] = extractelement <32 x i1> [[TMP11]], i32 20 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP128]]) -; CHECK-NEXT: [[TMP129:%.*]] = extractelement <32 x i1> [[TMP11]], i32 21 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP129]]) -; CHECK-NEXT: [[TMP130:%.*]] = extractelement <32 x i1> [[TMP11]], i32 22 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP130]]) -; CHECK-NEXT: [[TMP131:%.*]] = extractelement <32 x i1> [[TMP11]], i32 23 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP131]]) -; CHECK-NEXT: [[TMP132:%.*]] = extractelement <32 x i1> [[TMP11]], i32 24 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP132]]) -; CHECK-NEXT: [[TMP133:%.*]] = extractelement <32 x i1> [[TMP11]], i32 25 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP133]]) -; CHECK-NEXT: [[TMP134:%.*]] = extractelement <32 x i1> [[TMP11]], i32 26 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP134]]) -; CHECK-NEXT: [[TMP135:%.*]] = extractelement <32 x i1> [[TMP11]], i32 27 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP135]]) -; CHECK-NEXT: [[TMP136:%.*]] = extractelement <32 x i1> [[TMP11]], i32 28 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP136]]) -; CHECK-NEXT: [[TMP137:%.*]] = extractelement <32 x i1> [[TMP11]], i32 29 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP137]]) -; CHECK-NEXT: [[TMP138:%.*]] = extractelement <32 x i1> [[TMP11]], i32 30 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP138]]) -; CHECK-NEXT: [[TMP139:%.*]] = extractelement <32 x i1> [[TMP11]], i32 31 -; CHECK-NEXT: call void @llvm.assume(i1 [[TMP139]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128 -; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 -1024 -; CHECK-NEXT: [[TMP140:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP140]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] -; CHECK-NEXT: [[CMO:%.*]] = sub i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP141:%.*]] = mul i64 [[CMO]], -8 -; CHECK-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP141]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[IV_NEXT]] = getelementptr nusw i8, ptr [[IV]], i64 -8 ; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[IV]], align 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[L1]], 0 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[IV]], [[END]] -; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[LOOP]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi ptr [ [[IV]], %[[LOOP]] ], [ [[IND_ESCAPE]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi ptr [ [[IV]], %[[LOOP]] ] ; CHECK-NEXT: [[FINAL_LOAD:%.*]] = load i32, ptr [[IV_LCSSA]], align 4 ; CHECK-NEXT: ret i32 [[FINAL_LOAD]] ; @@ -375,9 +79,3 @@ exit: declare void @llvm.assume(i1 noundef) attributes #0 = { "target-cpu"="skylake-avx512" } -;. -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -;. diff --git a/llvm/test/Transforms/LoopVectorize/X86/iv-live-outs.ll b/llvm/test/Transforms/LoopVectorize/X86/iv-live-outs.ll new file mode 100644 index 00000000000000..738836d10c5a8c --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/iv-live-outs.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -S %s | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" + +define i64 @test_pr98660(ptr %dst, i64 %N) { +; CHECK-LABEL: define i64 @test_pr98660( +; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 32 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 24 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP3]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP4]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP9]], i32 8 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP9]], i32 16 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP9]], i32 24 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP13]], align 4 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP14]], align 4 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i32>, ptr [[TMP15]], align 4 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i32>, ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD1]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD3]], zeroinitializer +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP13]], i32 4, <8 x i1> [[TMP17]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP14]], i32 4, <8 x i1> [[TMP18]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP15]], i32 4, <8 x i1> [[TMP19]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP16]], i32 4, <8 x i1> [[TMP20]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[IND_ESCAPE:%.*]] = sub i64 [[N_VEC]], 1 +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[IV]], 1 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[OR]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 +; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[THEN]]: +; CHECK-NEXT: store i32 0, ptr [[GEP]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RET:%.*]] = phi i64 [ [[IV]], %[[LOOP_LATCH]] ], [ [[IND_ESCAPE]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[RET]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %or = or disjoint i64 %iv, 1 + %gep = getelementptr i32, ptr %dst, i64 %or + %l = load i32, ptr %gep + %c = icmp eq i32 %l, 0 + br i1 %c, label %then, label %loop.latch + +then: + store i32 0, ptr %gep, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add i64 %iv, 1 + %ec = icmp ult i64 %iv, %N + br i1 %ec, label %loop.header, label %exit + +exit: + %ret = phi i64 [ %iv, %loop.latch ] + ret i64 %ret +} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll b/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll index e858edefd0440b..fe96463470d8d0 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll @@ -83,7 +83,7 @@ for.body: %gep.D = getelementptr inbounds double, ptr %D, i64 %iv %l.D = load double, ptr %gep.D - %p.4 = call double @llvm.pow.f64(double %p.3, double %l.D) + %p.4 = call double @llvm.pow.f64(double %p.2, double %l.D) %p.5 = call double @llvm.pow.f64(double %p.4, double %p.3) %mul = fmul double 2.0, %p.5 %mul.2 = fmul double %mul, 2.0 diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll index 93056ad209bf7a..bf27c146ec9ce1 100644 --- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -202,7 +202,6 @@ exit: ; %iv.2 is dead in the vector loop and only used outside the loop. -; FIXME: Scalar steps for iv.2 are not removed at the moment. define i32 @iv_2_dead_in_loop_only_used_outside(ptr %ptr) { ; CHECK-LABEL: @iv_2_dead_in_loop_only_used_outside ; CHECK-LABEL: vector.body: @@ -210,7 +209,7 @@ define i32 @iv_2_dead_in_loop_only_used_outside(ptr %ptr) { ; VEC-NEXT: [[VEC_IND:%.+]] = phi <2 x i64> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.+]], %vector.body ] ; CHECK: [[IV_0:%.+]] = add i64 [[INDEX]], 0 ; VEC-NOT: add i64 [[INDEX]], 1 -; CHECK: [[IV_2_0:%.+]] = add i32 %offset.idx, 0 +; CHECK-NOT: add i32 %offset.idx, 0 ; CHECK-LABEL: scalar.ph: ; CHECK-NEXT: {{.+}} = phi i64 [ 1002, %middle.block ], [ 0, %entry ] ; CHECK-NEXT: {{.+}} = phi i32 [ 2004, %middle.block ], [ 0, %entry ] diff --git a/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll index 49058e443d6638..80a6bb50ca91b6 100644 --- a/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll +++ b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll @@ -42,9 +42,6 @@ define i32 @test(ptr %arr, i64 %n) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 1 -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 2 -; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP17:%.*]] = add nsw i64 [[TMP13]], -1 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll index 64fdefbb7cb670..c0eb4ccdd6d7e5 100644 --- a/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll +++ b/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll @@ -27,15 +27,6 @@ define void @test1_pr58811() { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], [[INDUCTION_IV_LCSSA]] -; CHECK-NEXT: [[TMP2:%.*]] = mul i32 0, [[INDUCTION_IV_LCSSA]] -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = mul i32 1, [[INDUCTION_IV_LCSSA]] -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = mul i32 2, [[INDUCTION_IV_LCSSA]] -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = mul i32 3, [[INDUCTION_IV_LCSSA]] -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], [[TMP8]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 196 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -131,15 +122,6 @@ define void @test2_pr58811() { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], [[INDUCTION_IV_LCSSA]] -; CHECK-NEXT: [[TMP2:%.*]] = mul i32 0, [[INDUCTION_IV_LCSSA]] -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = mul i32 1, [[INDUCTION_IV_LCSSA]] -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = mul i32 2, [[INDUCTION_IV_LCSSA]] -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = mul i32 3, [[INDUCTION_IV_LCSSA]] -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], [[TMP8]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 196 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -218,15 +200,6 @@ define void @test3_pr58811() { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = mul i32 0, [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = mul i32 1, [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = mul i32 2, [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = mul i32 3, [[TMP3]] -; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[OFFSET_IDX]], [[TMP10]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 196 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll b/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll index f3885b0b100e80..afb7d87bd17528 100644 --- a/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll +++ b/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll @@ -16,10 +16,6 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 3 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IND_END]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll b/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll new file mode 100644 index 00000000000000..8983c80bf3ef4b --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll @@ -0,0 +1,1744 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4-IC1 --check-prefix=CHECK +; RUN: opt -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4-IC2 --check-prefix=CHECK +; RUN: opt -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1-IC2 --check-prefix=CHECK + + +; int multi_user_cmp(float* a, long long n) { +; _Bool any = 0; +; _Bool all = 1; +; for (long long i = 0; i < n; i++) { +; if (a[i] < 0.0f) { +; any = 1; +; } else { +; all = 0; +; } +; } +; return all ? 1 : any ? 2 : 3; +; } +define i32 @multi_user_cmp(ptr readonly %a, i64 noundef %n) { +; CHECK-VF4-IC1-LABEL: define i32 @multi_user_cmp( +; CHECK-VF4-IC1-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF4-IC1-NEXT: entry: +; CHECK-VF4-IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4-IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4-IC1: vector.ph: +; CHECK-VF4-IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4-IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4-IC1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4-IC1: vector.body: +; CHECK-VF4-IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4-IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4-IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-VF4-IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-VF4-IC1-NEXT: [[TMP3:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-VF4-IC1-NEXT: [[TMP4]] = or <4 x i1> [[VEC_PHI1]], [[TMP3]] +; CHECK-VF4-IC1-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP3]], +; CHECK-VF4-IC1-NEXT: [[TMP6]] = or <4 x i1> [[VEC_PHI]], [[TMP5]] +; CHECK-VF4-IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4-IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4-IC1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4-IC1: middle.block: +; CHECK-VF4-IC1-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-VF4-IC1-NEXT: [[TMP9:%.*]] = freeze i1 [[TMP8]] +; CHECK-VF4-IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP9]], i1 false, i1 true +; CHECK-VF4-IC1-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-VF4-IC1-NEXT: [[TMP11:%.*]] = freeze i1 [[TMP10]] +; CHECK-VF4-IC1-NEXT: [[RDX_SELECT2:%.*]] = select i1 [[TMP11]], i1 true, i1 false +; CHECK-VF4-IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4-IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4-IC1: scalar.ph: +; CHECK-VF4-IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4-IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ true, [[ENTRY]] ] +; CHECK-VF4-IC1-NEXT: [[BC_MERGE_RDX3:%.*]] = phi i1 [ [[RDX_SELECT2]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY]] ] +; CHECK-VF4-IC1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4-IC1: for.body: +; CHECK-VF4-IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC1-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4-IC1-NEXT: [[CMP1:%.*]] = fcmp olt float [[LOAD1]], 0.000000e+00 +; CHECK-VF4-IC1-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4-IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4-IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4-IC1: exit: +; CHECK-VF4-IC1-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ], [ [[RDX_SELECT2]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4-IC1-NEXT: [[TMP12:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF4-IC1-NEXT: [[TMP13:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP12]] +; CHECK-VF4-IC1-NEXT: ret i32 [[TMP13]] +; +; CHECK-VF4-IC2-LABEL: define i32 @multi_user_cmp( +; CHECK-VF4-IC2-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF4-IC2-NEXT: entry: +; CHECK-VF4-IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-VF4-IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4-IC2: vector.ph: +; CHECK-VF4-IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-VF4-IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4-IC2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4-IC2: vector.body: +; CHECK-VF4-IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4-IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-VF4-IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4-IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-VF4-IC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; CHECK-VF4-IC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4 +; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 +; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 +; CHECK-VF4-IC2-NEXT: [[TMP6:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-VF4-IC2-NEXT: [[TMP7:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD4]], zeroinitializer +; CHECK-VF4-IC2-NEXT: [[TMP8]] = or <4 x i1> [[VEC_PHI2]], [[TMP6]] +; CHECK-VF4-IC2-NEXT: [[TMP9]] = or <4 x i1> [[VEC_PHI3]], [[TMP7]] +; CHECK-VF4-IC2-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP6]], +; CHECK-VF4-IC2-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], +; CHECK-VF4-IC2-NEXT: [[TMP12]] = or <4 x i1> [[VEC_PHI]], [[TMP10]] +; CHECK-VF4-IC2-NEXT: [[TMP13]] = or <4 x i1> [[VEC_PHI1]], [[TMP11]] +; CHECK-VF4-IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-VF4-IC2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4-IC2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4-IC2: middle.block: +; CHECK-VF4-IC2-NEXT: [[BIN_RDX:%.*]] = or <4 x i1> [[TMP13]], [[TMP12]] +; CHECK-VF4-IC2-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX]]) +; CHECK-VF4-IC2-NEXT: [[TMP16:%.*]] = freeze i1 [[TMP15]] +; CHECK-VF4-IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP16]], i1 false, i1 true +; CHECK-VF4-IC2-NEXT: [[BIN_RDX5:%.*]] = or <4 x i1> [[TMP9]], [[TMP8]] +; CHECK-VF4-IC2-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX5]]) +; CHECK-VF4-IC2-NEXT: [[TMP18:%.*]] = freeze i1 [[TMP17]] +; CHECK-VF4-IC2-NEXT: [[RDX_SELECT6:%.*]] = select i1 [[TMP18]], i1 true, i1 false +; CHECK-VF4-IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4-IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4-IC2: scalar.ph: +; CHECK-VF4-IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4-IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ true, [[ENTRY]] ] +; CHECK-VF4-IC2-NEXT: [[BC_MERGE_RDX7:%.*]] = phi i1 [ [[RDX_SELECT6]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY]] ] +; CHECK-VF4-IC2-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4-IC2: for.body: +; CHECK-VF4-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ [[BC_MERGE_RDX7]], [[SCALAR_PH]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC2-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4-IC2-NEXT: [[CMP1:%.*]] = fcmp olt float [[LOAD1]], 0.000000e+00 +; CHECK-VF4-IC2-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4-IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4-IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4-IC2: exit: +; CHECK-VF4-IC2-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ], [ [[RDX_SELECT6]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4-IC2-NEXT: [[TMP19:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF4-IC2-NEXT: [[TMP20:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP19]] +; CHECK-VF4-IC2-NEXT: ret i32 [[TMP20]] +; +; CHECK-VF1-IC2-LABEL: define i32 @multi_user_cmp( +; CHECK-VF1-IC2-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF1-IC2-NEXT: entry: +; CHECK-VF1-IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; CHECK-VF1-IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF1-IC2: vector.ph: +; CHECK-VF1-IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; CHECK-VF1-IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1-IC2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF1-IC2: vector.body: +; CHECK-VF1-IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[VEC_PHI:%.*]] = phi i1 [ false, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[VEC_PHI1:%.*]] = phi i1 [ false, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[VEC_PHI3:%.*]] = phi i1 [ false, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1-IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1-IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF1-IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1-IC2-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP2]], align 4 +; CHECK-VF1-IC2-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP3]], align 4 +; CHECK-VF1-IC2-NEXT: [[TMP6:%.*]] = fcmp olt float [[TMP4]], 0.000000e+00 +; CHECK-VF1-IC2-NEXT: [[TMP7:%.*]] = fcmp olt float [[TMP5]], 0.000000e+00 +; CHECK-VF1-IC2-NEXT: [[TMP8]] = or i1 [[VEC_PHI2]], [[TMP6]] +; CHECK-VF1-IC2-NEXT: [[TMP9]] = or i1 [[VEC_PHI3]], [[TMP7]] +; CHECK-VF1-IC2-NEXT: [[TMP10:%.*]] = xor i1 [[TMP6]], true +; CHECK-VF1-IC2-NEXT: [[TMP11:%.*]] = xor i1 [[TMP7]], true +; CHECK-VF1-IC2-NEXT: [[TMP12]] = or i1 [[VEC_PHI]], [[TMP10]] +; CHECK-VF1-IC2-NEXT: [[TMP13]] = or i1 [[VEC_PHI1]], [[TMP11]] +; CHECK-VF1-IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-VF1-IC2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1-IC2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF1-IC2: middle.block: +; CHECK-VF1-IC2-NEXT: [[BIN_RDX:%.*]] = or i1 [[TMP13]], [[TMP12]] +; CHECK-VF1-IC2-NEXT: [[TMP15:%.*]] = freeze i1 [[BIN_RDX]] +; CHECK-VF1-IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP15]], i1 false, i1 true +; CHECK-VF1-IC2-NEXT: [[BIN_RDX4:%.*]] = or i1 [[TMP9]], [[TMP8]] +; CHECK-VF1-IC2-NEXT: [[TMP16:%.*]] = freeze i1 [[BIN_RDX4]] +; CHECK-VF1-IC2-NEXT: [[RDX_SELECT5:%.*]] = select i1 [[TMP16]], i1 true, i1 false +; CHECK-VF1-IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1-IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF1-IC2: scalar.ph: +; CHECK-VF1-IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF1-IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ true, [[ENTRY]] ] +; CHECK-VF1-IC2-NEXT: [[BC_MERGE_RDX6:%.*]] = phi i1 [ [[RDX_SELECT5]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY]] ] +; CHECK-VF1-IC2-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF1-IC2: for.body: +; CHECK-VF1-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ [[BC_MERGE_RDX6]], [[SCALAR_PH]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF1-IC2-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1-IC2-NEXT: [[CMP1:%.*]] = fcmp olt float [[LOAD1]], 0.000000e+00 +; CHECK-VF1-IC2-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF1-IC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF1-IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF1-IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF1-IC2: exit: +; CHECK-VF1-IC2-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ], [ [[RDX_SELECT5]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1-IC2-NEXT: [[TMP17:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF1-IC2-NEXT: [[TMP18:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP17]] +; CHECK-VF1-IC2-NEXT: ret i32 [[TMP18]] +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %all.0.off010 = phi i1 [ true, %entry ], [ %all.0.off0., %for.body ] + %any.0.off09 = phi i1 [ false, %entry ], [ %.any.0.off0, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv + %load1 = load float, ptr %arrayidx, align 4 + %cmp1 = fcmp olt float %load1, 0.000000e+00 + %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09 + %all.0.off0. = select i1 %cmp1, i1 %all.0.off010, i1 false + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + %0 = select i1 %.any.0.off0, i32 2, i32 3 + %1 = select i1 %all.0.off0., i32 1, i32 %0 + ret i32 %1 +} + +;int multi_user_cmp_int(int* a, long long n) { +; _Bool any = 0; +; _Bool all = 1; +; for (long long i = 0; i < n; i++) { +; if (a[i] < 0) { +; any = 1; +; } else { +; all = 0; +; } +; } +; return all ? 1 : any ? 2 : 3; +;} +define i32 @multi_user_cmp_int(ptr readonly %a, i64 noundef %n) { +; CHECK-VF4-IC1-LABEL: define i32 @multi_user_cmp_int( +; CHECK-VF4-IC1-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF4-IC1-NEXT: entry: +; CHECK-VF4-IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4-IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4-IC1: vector.ph: +; CHECK-VF4-IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4-IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4-IC1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4-IC1: vector.body: +; CHECK-VF4-IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4-IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4-IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-VF4-IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-VF4-IC1-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-VF4-IC1-NEXT: [[TMP4]] = or <4 x i1> [[VEC_PHI1]], [[TMP3]] +; CHECK-VF4-IC1-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP3]], +; CHECK-VF4-IC1-NEXT: [[TMP6]] = or <4 x i1> [[VEC_PHI]], [[TMP5]] +; CHECK-VF4-IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4-IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4-IC1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF4-IC1: middle.block: +; CHECK-VF4-IC1-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-VF4-IC1-NEXT: [[TMP9:%.*]] = freeze i1 [[TMP8]] +; CHECK-VF4-IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP9]], i1 false, i1 true +; CHECK-VF4-IC1-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-VF4-IC1-NEXT: [[TMP11:%.*]] = freeze i1 [[TMP10]] +; CHECK-VF4-IC1-NEXT: [[RDX_SELECT2:%.*]] = select i1 [[TMP11]], i1 true, i1 false +; CHECK-VF4-IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4-IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4-IC1: scalar.ph: +; CHECK-VF4-IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4-IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ true, [[ENTRY]] ] +; CHECK-VF4-IC1-NEXT: [[BC_MERGE_RDX3:%.*]] = phi i1 [ [[RDX_SELECT2]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY]] ] +; CHECK-VF4-IC1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4-IC1: for.body: +; CHECK-VF4-IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC1-NEXT: [[LOAD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4-IC1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[LOAD1]], 0 +; CHECK-VF4-IC1-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4-IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4-IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4-IC1: exit: +; CHECK-VF4-IC1-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ], [ [[RDX_SELECT2]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4-IC1-NEXT: [[TMP12:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF4-IC1-NEXT: [[TMP13:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP12]] +; CHECK-VF4-IC1-NEXT: ret i32 [[TMP13]] +; +; CHECK-VF4-IC2-LABEL: define i32 @multi_user_cmp_int( +; CHECK-VF4-IC2-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF4-IC2-NEXT: entry: +; CHECK-VF4-IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-VF4-IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4-IC2: vector.ph: +; CHECK-VF4-IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-VF4-IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4-IC2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4-IC2: vector.body: +; CHECK-VF4-IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4-IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-VF4-IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4-IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-VF4-IC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; CHECK-VF4-IC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 4 +; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4 +; CHECK-VF4-IC2-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-VF4-IC2-NEXT: [[TMP7:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD4]], zeroinitializer +; CHECK-VF4-IC2-NEXT: [[TMP8]] = or <4 x i1> [[VEC_PHI2]], [[TMP6]] +; CHECK-VF4-IC2-NEXT: [[TMP9]] = or <4 x i1> [[VEC_PHI3]], [[TMP7]] +; CHECK-VF4-IC2-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP6]], +; CHECK-VF4-IC2-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], +; CHECK-VF4-IC2-NEXT: [[TMP12]] = or <4 x i1> [[VEC_PHI]], [[TMP10]] +; CHECK-VF4-IC2-NEXT: [[TMP13]] = or <4 x i1> [[VEC_PHI1]], [[TMP11]] +; CHECK-VF4-IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-VF4-IC2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4-IC2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF4-IC2: middle.block: +; CHECK-VF4-IC2-NEXT: [[BIN_RDX:%.*]] = or <4 x i1> [[TMP13]], [[TMP12]] +; CHECK-VF4-IC2-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX]]) +; CHECK-VF4-IC2-NEXT: [[TMP16:%.*]] = freeze i1 [[TMP15]] +; CHECK-VF4-IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP16]], i1 false, i1 true +; CHECK-VF4-IC2-NEXT: [[BIN_RDX5:%.*]] = or <4 x i1> [[TMP9]], [[TMP8]] +; CHECK-VF4-IC2-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX5]]) +; CHECK-VF4-IC2-NEXT: [[TMP18:%.*]] = freeze i1 [[TMP17]] +; CHECK-VF4-IC2-NEXT: [[RDX_SELECT6:%.*]] = select i1 [[TMP18]], i1 true, i1 false +; CHECK-VF4-IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4-IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4-IC2: scalar.ph: +; CHECK-VF4-IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4-IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ true, [[ENTRY]] ] +; CHECK-VF4-IC2-NEXT: [[BC_MERGE_RDX7:%.*]] = phi i1 [ [[RDX_SELECT6]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY]] ] +; CHECK-VF4-IC2-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4-IC2: for.body: +; CHECK-VF4-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ [[BC_MERGE_RDX7]], [[SCALAR_PH]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC2-NEXT: [[LOAD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4-IC2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[LOAD1]], 0 +; CHECK-VF4-IC2-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4-IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4-IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4-IC2: exit: +; CHECK-VF4-IC2-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ], [ [[RDX_SELECT6]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4-IC2-NEXT: [[TMP19:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF4-IC2-NEXT: [[TMP20:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP19]] +; CHECK-VF4-IC2-NEXT: ret i32 [[TMP20]] +; +; CHECK-VF1-IC2-LABEL: define i32 @multi_user_cmp_int( +; CHECK-VF1-IC2-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF1-IC2-NEXT: entry: +; CHECK-VF1-IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; CHECK-VF1-IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF1-IC2: vector.ph: +; CHECK-VF1-IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; CHECK-VF1-IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1-IC2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF1-IC2: vector.body: +; CHECK-VF1-IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[VEC_PHI:%.*]] = phi i1 [ false, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[VEC_PHI1:%.*]] = phi i1 [ false, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[VEC_PHI3:%.*]] = phi i1 [ false, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1-IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1-IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF1-IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1-IC2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-VF1-IC2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4 +; CHECK-VF1-IC2-NEXT: [[TMP6:%.*]] = icmp slt i32 [[TMP4]], 0 +; CHECK-VF1-IC2-NEXT: [[TMP7:%.*]] = icmp slt i32 [[TMP5]], 0 +; CHECK-VF1-IC2-NEXT: [[TMP8]] = or i1 [[VEC_PHI2]], [[TMP6]] +; CHECK-VF1-IC2-NEXT: [[TMP9]] = or i1 [[VEC_PHI3]], [[TMP7]] +; CHECK-VF1-IC2-NEXT: [[TMP10:%.*]] = xor i1 [[TMP6]], true +; CHECK-VF1-IC2-NEXT: [[TMP11:%.*]] = xor i1 [[TMP7]], true +; CHECK-VF1-IC2-NEXT: [[TMP12]] = or i1 [[VEC_PHI]], [[TMP10]] +; CHECK-VF1-IC2-NEXT: [[TMP13]] = or i1 [[VEC_PHI1]], [[TMP11]] +; CHECK-VF1-IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-VF1-IC2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1-IC2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF1-IC2: middle.block: +; CHECK-VF1-IC2-NEXT: [[BIN_RDX:%.*]] = or i1 [[TMP13]], [[TMP12]] +; CHECK-VF1-IC2-NEXT: [[TMP15:%.*]] = freeze i1 [[BIN_RDX]] +; CHECK-VF1-IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP15]], i1 false, i1 true +; CHECK-VF1-IC2-NEXT: [[BIN_RDX4:%.*]] = or i1 [[TMP9]], [[TMP8]] +; CHECK-VF1-IC2-NEXT: [[TMP16:%.*]] = freeze i1 [[BIN_RDX4]] +; CHECK-VF1-IC2-NEXT: [[RDX_SELECT5:%.*]] = select i1 [[TMP16]], i1 true, i1 false +; CHECK-VF1-IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1-IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF1-IC2: scalar.ph: +; CHECK-VF1-IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF1-IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ true, [[ENTRY]] ] +; CHECK-VF1-IC2-NEXT: [[BC_MERGE_RDX6:%.*]] = phi i1 [ [[RDX_SELECT5]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY]] ] +; CHECK-VF1-IC2-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF1-IC2: for.body: +; CHECK-VF1-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ [[BC_MERGE_RDX6]], [[SCALAR_PH]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF1-IC2-NEXT: [[LOAD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1-IC2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[LOAD1]], 0 +; CHECK-VF1-IC2-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF1-IC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF1-IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF1-IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF1-IC2: exit: +; CHECK-VF1-IC2-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ], [ [[RDX_SELECT5]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1-IC2-NEXT: [[TMP17:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF1-IC2-NEXT: [[TMP18:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP17]] +; CHECK-VF1-IC2-NEXT: ret i32 [[TMP18]] +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %all.0.off010 = phi i1 [ true, %entry ], [ %all.0.off0., %for.body ] + %any.0.off09 = phi i1 [ false, %entry ], [ %.any.0.off0, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv + %load1 = load i32, ptr %arrayidx, align 4 + %cmp1 = icmp slt i32 %load1, 0 + %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09 + %all.0.off0. = select i1 %cmp1, i1 %all.0.off010, i1 false + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + %0 = select i1 %.any.0.off0, i32 2, i32 3 + %1 = select i1 %all.0.off0., i32 1, i32 %0 + ret i32 %1 +} + +; int multi_user_cmp_branch_use(float* a, int *b, long long n) { +; _Bool any = 0; +; _Bool all = 1; +; for (long long i = 0; i < n; i++) { +; _Bool c = a[i] < 0.0f; +; if (c) { +; any = 1; +; } else { +; all = 0; +; } +; if (c) +; b[i]++; +; } +; return all ? 1 : any ? 2 : 3; +; } +define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) { +; CHECK-VF4-IC1-LABEL: define i32 @multi_user_cmp_branch_use( +; CHECK-VF4-IC1-SAME: ptr readonly [[A:%.*]], ptr [[B:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF4-IC1-NEXT: entry: +; CHECK-VF4-IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4-IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-VF4-IC1: vector.memcheck: +; CHECK-VF4-IC1-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 +; CHECK-VF4-IC1-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-VF4-IC1-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4-IC1-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]] +; CHECK-VF4-IC1-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] +; CHECK-VF4-IC1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-VF4-IC1-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-VF4-IC1: vector.ph: +; CHECK-VF4-IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4-IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4-IC1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4-IC1: vector.body: +; CHECK-VF4-IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ] +; CHECK-VF4-IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[PRED_STORE_CONTINUE8]] ] +; CHECK-VF4-IC1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[PRED_STORE_CONTINUE8]] ] +; CHECK-VF4-IC1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4-IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-VF4-IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; CHECK-VF4-IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP3]], align 4, !alias.scope [[META6:![0-9]+]] +; CHECK-VF4-IC1-NEXT: [[TMP4:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-VF4-IC1-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI2]], [[TMP4]] +; CHECK-VF4-IC1-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], +; CHECK-VF4-IC1-NEXT: [[TMP7]] = or <4 x i1> [[VEC_PHI]], [[TMP6]] +; CHECK-VF4-IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0 +; CHECK-VF4-IC1-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-VF4-IC1: pred.store.if: +; CHECK-VF4-IC1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]] +; CHECK-VF4-IC1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META6]] +; CHECK-VF4-IC1-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP10]], 1 +; CHECK-VF4-IC1-NEXT: store i32 [[TMP11]], ptr [[TMP9]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC1-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK-VF4-IC1: pred.store.continue: +; CHECK-VF4-IC1-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_STORE_IF]] ] +; CHECK-VF4-IC1-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1 +; CHECK-VF4-IC1-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] +; CHECK-VF4-IC1: pred.store.if3: +; CHECK-VF4-IC1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF4-IC1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP14]] +; CHECK-VF4-IC1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC1-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 1 +; CHECK-VF4-IC1-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC1-NEXT: br label [[PRED_STORE_CONTINUE4]] +; CHECK-VF4-IC1: pred.store.continue4: +; CHECK-VF4-IC1-NEXT: [[TMP18:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE]] ], [ [[TMP16]], [[PRED_STORE_IF3]] ] +; CHECK-VF4-IC1-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2 +; CHECK-VF4-IC1-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] +; CHECK-VF4-IC1: pred.store.if5: +; CHECK-VF4-IC1-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF4-IC1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP20]] +; CHECK-VF4-IC1-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP22]], 1 +; CHECK-VF4-IC1-NEXT: store i32 [[TMP23]], ptr [[TMP21]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC1-NEXT: br label [[PRED_STORE_CONTINUE6]] +; CHECK-VF4-IC1: pred.store.continue6: +; CHECK-VF4-IC1-NEXT: [[TMP24:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE4]] ], [ [[TMP22]], [[PRED_STORE_IF5]] ] +; CHECK-VF4-IC1-NEXT: [[TMP25:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3 +; CHECK-VF4-IC1-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]] +; CHECK-VF4-IC1: pred.store.if7: +; CHECK-VF4-IC1-NEXT: [[TMP26:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF4-IC1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP26]] +; CHECK-VF4-IC1-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC1-NEXT: [[TMP29:%.*]] = add nsw i32 [[TMP28]], 1 +; CHECK-VF4-IC1-NEXT: store i32 [[TMP29]], ptr [[TMP27]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC1-NEXT: br label [[PRED_STORE_CONTINUE8]] +; CHECK-VF4-IC1: pred.store.continue8: +; CHECK-VF4-IC1-NEXT: [[TMP30:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE6]] ], [ [[TMP28]], [[PRED_STORE_IF7]] ] +; CHECK-VF4-IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4-IC1-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4-IC1-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-VF4-IC1: middle.block: +; CHECK-VF4-IC1-NEXT: [[TMP32:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]]) +; CHECK-VF4-IC1-NEXT: [[TMP33:%.*]] = freeze i1 [[TMP32]] +; CHECK-VF4-IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP33]], i1 false, i1 true +; CHECK-VF4-IC1-NEXT: [[TMP34:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-VF4-IC1-NEXT: [[TMP35:%.*]] = freeze i1 [[TMP34]] +; CHECK-VF4-IC1-NEXT: [[RDX_SELECT9:%.*]] = select i1 [[TMP35]], i1 true, i1 false +; CHECK-VF4-IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4-IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4-IC1: scalar.ph: +; CHECK-VF4-IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-VF4-IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ true, [[VECTOR_MEMCHECK]] ], [ true, [[ENTRY]] ] +; CHECK-VF4-IC1-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i1 [ [[RDX_SELECT9]], [[MIDDLE_BLOCK]] ], [ false, [[VECTOR_MEMCHECK]] ], [ false, [[ENTRY]] ] +; CHECK-VF4-IC1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4-IC1: for.body: +; CHECK-VF4-IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END6:%.*]] ] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ALL_0_OFF0_:%.*]], [[IF_END6]] ] +; CHECK-VF4-IC1-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ [[BC_MERGE_RDX10]], [[SCALAR_PH]] ], [ [[DOTANY_0_OFF0:%.*]], [[IF_END6]] ] +; CHECK-VF4-IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC1-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4-IC1-NEXT: [[CMP1:%.*]] = fcmp olt float [[LOAD1]], 0.000000e+00 +; CHECK-VF4-IC1-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC1-NEXT: br i1 [[CMP1]], label [[IF_THEN3:%.*]], label [[IF_END6]] +; CHECK-VF4-IC1: if.then3: +; CHECK-VF4-IC1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC1-NEXT: [[LOAD2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-VF4-IC1-NEXT: [[INC:%.*]] = add nsw i32 [[LOAD2]], 1 +; CHECK-VF4-IC1-NEXT: store i32 [[INC]], ptr [[ARRAYIDX5]], align 4 +; CHECK-VF4-IC1-NEXT: br label [[IF_END6]] +; CHECK-VF4-IC1: if.end6: +; CHECK-VF4-IC1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4-IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4-IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-VF4-IC1: exit: +; CHECK-VF4-IC1-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[IF_END6]] ], [ [[RDX_SELECT9]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[IF_END6]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4-IC1-NEXT: [[TMP36:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF4-IC1-NEXT: [[TMP37:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP36]] +; CHECK-VF4-IC1-NEXT: ret i32 [[TMP37]] +; +; CHECK-VF4-IC2-LABEL: define i32 @multi_user_cmp_branch_use( +; CHECK-VF4-IC2-SAME: ptr readonly [[A:%.*]], ptr [[B:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF4-IC2-NEXT: entry: +; CHECK-VF4-IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-VF4-IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-VF4-IC2: vector.memcheck: +; CHECK-VF4-IC2-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 +; CHECK-VF4-IC2-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-VF4-IC2-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4-IC2-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]] +; CHECK-VF4-IC2-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] +; CHECK-VF4-IC2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-VF4-IC2-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-VF4-IC2: vector.ph: +; CHECK-VF4-IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-VF4-IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4-IC2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4-IC2: vector.body: +; CHECK-VF4-IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE19:%.*]] ] +; CHECK-VF4-IC2-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[PRED_STORE_CONTINUE19]] ] +; CHECK-VF4-IC2-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[PRED_STORE_CONTINUE19]] ] +; CHECK-VF4-IC2-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[PRED_STORE_CONTINUE19]] ] +; CHECK-VF4-IC2-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[PRED_STORE_CONTINUE19]] ] +; CHECK-VF4-IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4-IC2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 4 +; CHECK-VF4-IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-VF4-IC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-VF4-IC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 +; CHECK-VF4-IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 4 +; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP5]], align 4, !alias.scope [[META6:![0-9]+]] +; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !alias.scope [[META6]] +; CHECK-VF4-IC2-NEXT: [[TMP7:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-VF4-IC2-NEXT: [[TMP8:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD5]], zeroinitializer +; CHECK-VF4-IC2-NEXT: [[TMP9]] = or <4 x i1> [[VEC_PHI3]], [[TMP7]] +; CHECK-VF4-IC2-NEXT: [[TMP10]] = or <4 x i1> [[VEC_PHI4]], [[TMP8]] +; CHECK-VF4-IC2-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], +; CHECK-VF4-IC2-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP8]], +; CHECK-VF4-IC2-NEXT: [[TMP13]] = or <4 x i1> [[VEC_PHI]], [[TMP11]] +; CHECK-VF4-IC2-NEXT: [[TMP14]] = or <4 x i1> [[VEC_PHI2]], [[TMP12]] +; CHECK-VF4-IC2-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0 +; CHECK-VF4-IC2-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-VF4-IC2: pred.store.if: +; CHECK-VF4-IC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]] +; CHECK-VF4-IC2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META6]] +; CHECK-VF4-IC2-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP17]], 1 +; CHECK-VF4-IC2-NEXT: store i32 [[TMP18]], ptr [[TMP16]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC2-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK-VF4-IC2: pred.store.continue: +; CHECK-VF4-IC2-NEXT: [[TMP19:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP17]], [[PRED_STORE_IF]] ] +; CHECK-VF4-IC2-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP7]], i32 1 +; CHECK-VF4-IC2-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]] +; CHECK-VF4-IC2: pred.store.if6: +; CHECK-VF4-IC2-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF4-IC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP21]] +; CHECK-VF4-IC2-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC2-NEXT: [[TMP24:%.*]] = add nsw i32 [[TMP23]], 1 +; CHECK-VF4-IC2-NEXT: store i32 [[TMP24]], ptr [[TMP22]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC2-NEXT: br label [[PRED_STORE_CONTINUE7]] +; CHECK-VF4-IC2: pred.store.continue7: +; CHECK-VF4-IC2-NEXT: [[TMP25:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE]] ], [ [[TMP23]], [[PRED_STORE_IF6]] ] +; CHECK-VF4-IC2-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP7]], i32 2 +; CHECK-VF4-IC2-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] +; CHECK-VF4-IC2: pred.store.if8: +; CHECK-VF4-IC2-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 2 +; CHECK-VF4-IC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP27]] +; CHECK-VF4-IC2-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC2-NEXT: [[TMP30:%.*]] = add nsw i32 [[TMP29]], 1 +; CHECK-VF4-IC2-NEXT: store i32 [[TMP30]], ptr [[TMP28]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC2-NEXT: br label [[PRED_STORE_CONTINUE9]] +; CHECK-VF4-IC2: pred.store.continue9: +; CHECK-VF4-IC2-NEXT: [[TMP31:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE7]] ], [ [[TMP29]], [[PRED_STORE_IF8]] ] +; CHECK-VF4-IC2-NEXT: [[TMP32:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3 +; CHECK-VF4-IC2-NEXT: br i1 [[TMP32]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]] +; CHECK-VF4-IC2: pred.store.if10: +; CHECK-VF4-IC2-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], 3 +; CHECK-VF4-IC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP33]] +; CHECK-VF4-IC2-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP34]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC2-NEXT: [[TMP36:%.*]] = add nsw i32 [[TMP35]], 1 +; CHECK-VF4-IC2-NEXT: store i32 [[TMP36]], ptr [[TMP34]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC2-NEXT: br label [[PRED_STORE_CONTINUE11]] +; CHECK-VF4-IC2: pred.store.continue11: +; CHECK-VF4-IC2-NEXT: [[TMP37:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE9]] ], [ [[TMP35]], [[PRED_STORE_IF10]] ] +; CHECK-VF4-IC2-NEXT: [[TMP38:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0 +; CHECK-VF4-IC2-NEXT: br i1 [[TMP38]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]] +; CHECK-VF4-IC2: pred.store.if12: +; CHECK-VF4-IC2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP2]] +; CHECK-VF4-IC2-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP39]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC2-NEXT: [[TMP41:%.*]] = add nsw i32 [[TMP40]], 1 +; CHECK-VF4-IC2-NEXT: store i32 [[TMP41]], ptr [[TMP39]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC2-NEXT: br label [[PRED_STORE_CONTINUE13]] +; CHECK-VF4-IC2: pred.store.continue13: +; CHECK-VF4-IC2-NEXT: [[TMP42:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE11]] ], [ [[TMP40]], [[PRED_STORE_IF12]] ] +; CHECK-VF4-IC2-NEXT: [[TMP43:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1 +; CHECK-VF4-IC2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]] +; CHECK-VF4-IC2: pred.store.if14: +; CHECK-VF4-IC2-NEXT: [[TMP44:%.*]] = add i64 [[INDEX]], 5 +; CHECK-VF4-IC2-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP44]] +; CHECK-VF4-IC2-NEXT: [[TMP46:%.*]] = load i32, ptr [[TMP45]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[TMP46]], 1 +; CHECK-VF4-IC2-NEXT: store i32 [[TMP47]], ptr [[TMP45]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC2-NEXT: br label [[PRED_STORE_CONTINUE15]] +; CHECK-VF4-IC2: pred.store.continue15: +; CHECK-VF4-IC2-NEXT: [[TMP48:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE13]] ], [ [[TMP46]], [[PRED_STORE_IF14]] ] +; CHECK-VF4-IC2-NEXT: [[TMP49:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2 +; CHECK-VF4-IC2-NEXT: br i1 [[TMP49]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] +; CHECK-VF4-IC2: pred.store.if16: +; CHECK-VF4-IC2-NEXT: [[TMP50:%.*]] = add i64 [[INDEX]], 6 +; CHECK-VF4-IC2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP50]] +; CHECK-VF4-IC2-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP51]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC2-NEXT: [[TMP53:%.*]] = add nsw i32 [[TMP52]], 1 +; CHECK-VF4-IC2-NEXT: store i32 [[TMP53]], ptr [[TMP51]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC2-NEXT: br label [[PRED_STORE_CONTINUE17]] +; CHECK-VF4-IC2: pred.store.continue17: +; CHECK-VF4-IC2-NEXT: [[TMP54:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE15]] ], [ [[TMP52]], [[PRED_STORE_IF16]] ] +; CHECK-VF4-IC2-NEXT: [[TMP55:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3 +; CHECK-VF4-IC2-NEXT: br i1 [[TMP55]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19]] +; CHECK-VF4-IC2: pred.store.if18: +; CHECK-VF4-IC2-NEXT: [[TMP56:%.*]] = add i64 [[INDEX]], 7 +; CHECK-VF4-IC2-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP56]] +; CHECK-VF4-IC2-NEXT: [[TMP58:%.*]] = load i32, ptr [[TMP57]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC2-NEXT: [[TMP59:%.*]] = add nsw i32 [[TMP58]], 1 +; CHECK-VF4-IC2-NEXT: store i32 [[TMP59]], ptr [[TMP57]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF4-IC2-NEXT: br label [[PRED_STORE_CONTINUE19]] +; CHECK-VF4-IC2: pred.store.continue19: +; CHECK-VF4-IC2-NEXT: [[TMP60:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE17]] ], [ [[TMP58]], [[PRED_STORE_IF18]] ] +; CHECK-VF4-IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-VF4-IC2-NEXT: [[TMP61:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4-IC2-NEXT: br i1 [[TMP61]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-VF4-IC2: middle.block: +; CHECK-VF4-IC2-NEXT: [[BIN_RDX:%.*]] = or <4 x i1> [[TMP14]], [[TMP13]] +; CHECK-VF4-IC2-NEXT: [[TMP62:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX]]) +; CHECK-VF4-IC2-NEXT: [[TMP63:%.*]] = freeze i1 [[TMP62]] +; CHECK-VF4-IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP63]], i1 false, i1 true +; CHECK-VF4-IC2-NEXT: [[BIN_RDX20:%.*]] = or <4 x i1> [[TMP10]], [[TMP9]] +; CHECK-VF4-IC2-NEXT: [[TMP64:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX20]]) +; CHECK-VF4-IC2-NEXT: [[TMP65:%.*]] = freeze i1 [[TMP64]] +; CHECK-VF4-IC2-NEXT: [[RDX_SELECT21:%.*]] = select i1 [[TMP65]], i1 true, i1 false +; CHECK-VF4-IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4-IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4-IC2: scalar.ph: +; CHECK-VF4-IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-VF4-IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ true, [[VECTOR_MEMCHECK]] ], [ true, [[ENTRY]] ] +; CHECK-VF4-IC2-NEXT: [[BC_MERGE_RDX22:%.*]] = phi i1 [ [[RDX_SELECT21]], [[MIDDLE_BLOCK]] ], [ false, [[VECTOR_MEMCHECK]] ], [ false, [[ENTRY]] ] +; CHECK-VF4-IC2-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4-IC2: for.body: +; CHECK-VF4-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END6:%.*]] ] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ALL_0_OFF0_:%.*]], [[IF_END6]] ] +; CHECK-VF4-IC2-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ [[BC_MERGE_RDX22]], [[SCALAR_PH]] ], [ [[DOTANY_0_OFF0:%.*]], [[IF_END6]] ] +; CHECK-VF4-IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC2-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4-IC2-NEXT: [[CMP1:%.*]] = fcmp olt float [[LOAD1]], 0.000000e+00 +; CHECK-VF4-IC2-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC2-NEXT: br i1 [[CMP1]], label [[IF_THEN3:%.*]], label [[IF_END6]] +; CHECK-VF4-IC2: if.then3: +; CHECK-VF4-IC2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC2-NEXT: [[LOAD2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-VF4-IC2-NEXT: [[INC:%.*]] = add nsw i32 [[LOAD2]], 1 +; CHECK-VF4-IC2-NEXT: store i32 [[INC]], ptr [[ARRAYIDX5]], align 4 +; CHECK-VF4-IC2-NEXT: br label [[IF_END6]] +; CHECK-VF4-IC2: if.end6: +; CHECK-VF4-IC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4-IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4-IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-VF4-IC2: exit: +; CHECK-VF4-IC2-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[IF_END6]] ], [ [[RDX_SELECT21]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[IF_END6]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4-IC2-NEXT: [[TMP66:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF4-IC2-NEXT: [[TMP67:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP66]] +; CHECK-VF4-IC2-NEXT: ret i32 [[TMP67]] +; +; CHECK-VF1-IC2-LABEL: define i32 @multi_user_cmp_branch_use( +; CHECK-VF1-IC2-SAME: ptr readonly [[A:%.*]], ptr [[B:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF1-IC2-NEXT: entry: +; CHECK-VF1-IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; CHECK-VF1-IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-VF1-IC2: vector.memcheck: +; CHECK-VF1-IC2-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 +; CHECK-VF1-IC2-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-VF1-IC2-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-VF1-IC2-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]] +; CHECK-VF1-IC2-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] +; CHECK-VF1-IC2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-VF1-IC2-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-VF1-IC2: vector.ph: +; CHECK-VF1-IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; CHECK-VF1-IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1-IC2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF1-IC2: vector.body: +; CHECK-VF1-IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] +; CHECK-VF1-IC2-NEXT: [[VEC_PHI:%.*]] = phi i1 [ false, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[PRED_STORE_CONTINUE6]] ] +; CHECK-VF1-IC2-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[PRED_STORE_CONTINUE6]] ] +; CHECK-VF1-IC2-NEXT: [[VEC_PHI3:%.*]] = phi i1 [ false, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[PRED_STORE_CONTINUE6]] ] +; CHECK-VF1-IC2-NEXT: [[VEC_PHI4:%.*]] = phi i1 [ false, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[PRED_STORE_CONTINUE6]] ] +; CHECK-VF1-IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1-IC2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1-IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1-IC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; CHECK-VF1-IC2-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP3]], align 4, !alias.scope [[META6:![0-9]+]] +; CHECK-VF1-IC2-NEXT: [[TMP6:%.*]] = load float, ptr [[TMP4]], align 4, !alias.scope [[META6]] +; CHECK-VF1-IC2-NEXT: [[TMP7:%.*]] = fcmp olt float [[TMP5]], 0.000000e+00 +; CHECK-VF1-IC2-NEXT: [[TMP8:%.*]] = fcmp olt float [[TMP6]], 0.000000e+00 +; CHECK-VF1-IC2-NEXT: [[TMP9]] = or i1 [[VEC_PHI3]], [[TMP7]] +; CHECK-VF1-IC2-NEXT: [[TMP10]] = or i1 [[VEC_PHI4]], [[TMP8]] +; CHECK-VF1-IC2-NEXT: [[TMP11:%.*]] = xor i1 [[TMP7]], true +; CHECK-VF1-IC2-NEXT: [[TMP12:%.*]] = xor i1 [[TMP8]], true +; CHECK-VF1-IC2-NEXT: [[TMP13]] = or i1 [[VEC_PHI]], [[TMP11]] +; CHECK-VF1-IC2-NEXT: [[TMP14]] = or i1 [[VEC_PHI2]], [[TMP12]] +; CHECK-VF1-IC2-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-VF1-IC2: pred.store.if: +; CHECK-VF1-IC2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]] +; CHECK-VF1-IC2-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META6]] +; CHECK-VF1-IC2-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 1 +; CHECK-VF1-IC2-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF1-IC2-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK-VF1-IC2: pred.store.continue: +; CHECK-VF1-IC2-NEXT: [[TMP18:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP16]], [[PRED_STORE_IF]] ] +; CHECK-VF1-IC2-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] +; CHECK-VF1-IC2: pred.store.if5: +; CHECK-VF1-IC2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP2]] +; CHECK-VF1-IC2-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF1-IC2-NEXT: [[TMP21:%.*]] = add nsw i32 [[TMP20]], 1 +; CHECK-VF1-IC2-NEXT: store i32 [[TMP21]], ptr [[TMP19]], align 4, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-VF1-IC2-NEXT: br label [[PRED_STORE_CONTINUE6]] +; CHECK-VF1-IC2: pred.store.continue6: +; CHECK-VF1-IC2-NEXT: [[TMP22:%.*]] = phi i32 [ poison, [[PRED_STORE_CONTINUE]] ], [ [[TMP20]], [[PRED_STORE_IF5]] ] +; CHECK-VF1-IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-VF1-IC2-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1-IC2-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-VF1-IC2: middle.block: +; CHECK-VF1-IC2-NEXT: [[BIN_RDX:%.*]] = or i1 [[TMP14]], [[TMP13]] +; CHECK-VF1-IC2-NEXT: [[TMP24:%.*]] = freeze i1 [[BIN_RDX]] +; CHECK-VF1-IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP24]], i1 false, i1 true +; CHECK-VF1-IC2-NEXT: [[BIN_RDX7:%.*]] = or i1 [[TMP10]], [[TMP9]] +; CHECK-VF1-IC2-NEXT: [[TMP25:%.*]] = freeze i1 [[BIN_RDX7]] +; CHECK-VF1-IC2-NEXT: [[RDX_SELECT8:%.*]] = select i1 [[TMP25]], i1 true, i1 false +; CHECK-VF1-IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1-IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF1-IC2: scalar.ph: +; CHECK-VF1-IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-VF1-IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ true, [[VECTOR_MEMCHECK]] ], [ true, [[ENTRY]] ] +; CHECK-VF1-IC2-NEXT: [[BC_MERGE_RDX9:%.*]] = phi i1 [ [[RDX_SELECT8]], [[MIDDLE_BLOCK]] ], [ false, [[VECTOR_MEMCHECK]] ], [ false, [[ENTRY]] ] +; CHECK-VF1-IC2-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF1-IC2: for.body: +; CHECK-VF1-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END6:%.*]] ] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ALL_0_OFF0_:%.*]], [[IF_END6]] ] +; CHECK-VF1-IC2-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ [[BC_MERGE_RDX9]], [[SCALAR_PH]] ], [ [[DOTANY_0_OFF0:%.*]], [[IF_END6]] ] +; CHECK-VF1-IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF1-IC2-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1-IC2-NEXT: [[CMP1:%.*]] = fcmp olt float [[LOAD1]], 0.000000e+00 +; CHECK-VF1-IC2-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF1-IC2-NEXT: br i1 [[CMP1]], label [[IF_THEN3:%.*]], label [[IF_END6]] +; CHECK-VF1-IC2: if.then3: +; CHECK-VF1-IC2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-VF1-IC2-NEXT: [[LOAD2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-VF1-IC2-NEXT: [[INC:%.*]] = add nsw i32 [[LOAD2]], 1 +; CHECK-VF1-IC2-NEXT: store i32 [[INC]], ptr [[ARRAYIDX5]], align 4 +; CHECK-VF1-IC2-NEXT: br label [[IF_END6]] +; CHECK-VF1-IC2: if.end6: +; CHECK-VF1-IC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF1-IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF1-IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-VF1-IC2: exit: +; CHECK-VF1-IC2-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[IF_END6]] ], [ [[RDX_SELECT8]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[IF_END6]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1-IC2-NEXT: [[TMP26:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF1-IC2-NEXT: [[TMP27:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP26]] +; CHECK-VF1-IC2-NEXT: ret i32 [[TMP27]] +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end6 ] + %all.0.off010 = phi i1 [ true, %entry ], [ %all.0.off0., %if.end6 ] + %any.0.off09 = phi i1 [ false, %entry ], [ %.any.0.off0, %if.end6 ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv + %load1 = load float, ptr %arrayidx, align 4 + %cmp1 = fcmp olt float %load1, 0.000000e+00 + %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09 + %all.0.off0. = select i1 %cmp1, i1 %all.0.off010, i1 false + br i1 %cmp1, label %if.then3, label %if.end6 + +if.then3: + %arrayidx5 = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %load2 = load i32, ptr %arrayidx5, align 4 + %inc = add nsw i32 %load2, 1 + store i32 %inc, ptr %arrayidx5, align 4 + br label %if.end6 + +if.end6: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + %0 = select i1 %.any.0.off0, i32 2, i32 3 + %1 = select i1 %all.0.off0., i32 1, i32 %0 + ret i32 %1 +} + +; int multi_user_cmp_branch_use_and_outside_bb_use(float* a, long long n) { +; _Bool any = 0; +; _Bool all = 1; +; _Bool c; +; for (long long i = 0; i < n; i++) { +; c = a[i] < 0.0f; +; if (c) { +; any = 1; +; } else { +; all = 0; +; } +; } +; return all ? c : any ? 2 : 3; +; } +define i32 @multi_user_cmp_branch_use_and_outside_bb_use(ptr readonly %a, i64 noundef %n) { +; CHECK-VF4-IC1-LABEL: define i32 @multi_user_cmp_branch_use_and_outside_bb_use( +; CHECK-VF4-IC1-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF4-IC1-NEXT: entry: +; CHECK-VF4-IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4-IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4-IC1: vector.ph: +; CHECK-VF4-IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4-IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4-IC1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4-IC1: vector.body: +; CHECK-VF4-IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4-IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4-IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-VF4-IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-VF4-IC1-NEXT: [[TMP3:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-VF4-IC1-NEXT: [[TMP4]] = or <4 x i1> [[VEC_PHI1]], [[TMP3]] +; CHECK-VF4-IC1-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP3]], +; CHECK-VF4-IC1-NEXT: [[TMP6]] = or <4 x i1> [[VEC_PHI]], [[TMP5]] +; CHECK-VF4-IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4-IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4-IC1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-VF4-IC1: middle.block: +; CHECK-VF4-IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 +; CHECK-VF4-IC1-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-VF4-IC1-NEXT: [[TMP10:%.*]] = freeze i1 [[TMP9]] +; CHECK-VF4-IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP10]], i1 false, i1 true +; CHECK-VF4-IC1-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-VF4-IC1-NEXT: [[TMP12:%.*]] = freeze i1 [[TMP11]] +; CHECK-VF4-IC1-NEXT: [[RDX_SELECT2:%.*]] = select i1 [[TMP12]], i1 true, i1 false +; CHECK-VF4-IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4-IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4-IC1: scalar.ph: +; CHECK-VF4-IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4-IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ true, [[ENTRY]] ] +; CHECK-VF4-IC1-NEXT: [[BC_MERGE_RDX3:%.*]] = phi i1 [ [[RDX_SELECT2]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY]] ] +; CHECK-VF4-IC1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4-IC1: for.body: +; CHECK-VF4-IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC1-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4-IC1-NEXT: [[CMP1:%.*]] = fcmp olt float [[LOAD1]], 0.000000e+00 +; CHECK-VF4-IC1-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4-IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4-IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4-IC1: exit: +; CHECK-VF4-IC1-NEXT: [[CMP1_LCSSA:%.*]] = phi i1 [ [[CMP1]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4-IC1-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ], [ [[RDX_SELECT2]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4-IC1-NEXT: [[TMP13:%.*]] = zext i1 [[CMP1_LCSSA]] to i32 +; CHECK-VF4-IC1-NEXT: [[TMP14:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF4-IC1-NEXT: [[TMP15:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 [[TMP13]], i32 [[TMP14]] +; CHECK-VF4-IC1-NEXT: ret i32 [[TMP15]] +; +; CHECK-VF4-IC2-LABEL: define i32 @multi_user_cmp_branch_use_and_outside_bb_use( +; CHECK-VF4-IC2-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF4-IC2-NEXT: entry: +; CHECK-VF4-IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-VF4-IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF4-IC2: vector.ph: +; CHECK-VF4-IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-VF4-IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4-IC2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF4-IC2: vector.body: +; CHECK-VF4-IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF4-IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-VF4-IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF4-IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-VF4-IC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; CHECK-VF4-IC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4 +; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 +; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 +; CHECK-VF4-IC2-NEXT: [[TMP6:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-VF4-IC2-NEXT: [[TMP7:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD4]], zeroinitializer +; CHECK-VF4-IC2-NEXT: [[TMP8]] = or <4 x i1> [[VEC_PHI2]], [[TMP6]] +; CHECK-VF4-IC2-NEXT: [[TMP9]] = or <4 x i1> [[VEC_PHI3]], [[TMP7]] +; CHECK-VF4-IC2-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP6]], +; CHECK-VF4-IC2-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], +; CHECK-VF4-IC2-NEXT: [[TMP12]] = or <4 x i1> [[VEC_PHI]], [[TMP10]] +; CHECK-VF4-IC2-NEXT: [[TMP13]] = or <4 x i1> [[VEC_PHI1]], [[TMP11]] +; CHECK-VF4-IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-VF4-IC2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4-IC2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-VF4-IC2: middle.block: +; CHECK-VF4-IC2-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3 +; CHECK-VF4-IC2-NEXT: [[BIN_RDX:%.*]] = or <4 x i1> [[TMP13]], [[TMP12]] +; CHECK-VF4-IC2-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX]]) +; CHECK-VF4-IC2-NEXT: [[TMP17:%.*]] = freeze i1 [[TMP16]] +; CHECK-VF4-IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP17]], i1 false, i1 true +; CHECK-VF4-IC2-NEXT: [[BIN_RDX5:%.*]] = or <4 x i1> [[TMP9]], [[TMP8]] +; CHECK-VF4-IC2-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX5]]) +; CHECK-VF4-IC2-NEXT: [[TMP19:%.*]] = freeze i1 [[TMP18]] +; CHECK-VF4-IC2-NEXT: [[RDX_SELECT6:%.*]] = select i1 [[TMP19]], i1 true, i1 false +; CHECK-VF4-IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4-IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF4-IC2: scalar.ph: +; CHECK-VF4-IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF4-IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ true, [[ENTRY]] ] +; CHECK-VF4-IC2-NEXT: [[BC_MERGE_RDX7:%.*]] = phi i1 [ [[RDX_SELECT6]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY]] ] +; CHECK-VF4-IC2-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4-IC2: for.body: +; CHECK-VF4-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ [[BC_MERGE_RDX7]], [[SCALAR_PH]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC2-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4-IC2-NEXT: [[CMP1:%.*]] = fcmp olt float [[LOAD1]], 0.000000e+00 +; CHECK-VF4-IC2-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4-IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4-IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4-IC2: exit: +; CHECK-VF4-IC2-NEXT: [[CMP1_LCSSA:%.*]] = phi i1 [ [[CMP1]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4-IC2-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ], [ [[RDX_SELECT6]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF4-IC2-NEXT: [[TMP20:%.*]] = zext i1 [[CMP1_LCSSA]] to i32 +; CHECK-VF4-IC2-NEXT: [[TMP21:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF4-IC2-NEXT: [[TMP22:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 [[TMP20]], i32 [[TMP21]] +; CHECK-VF4-IC2-NEXT: ret i32 [[TMP22]] +; +; CHECK-VF1-IC2-LABEL: define i32 @multi_user_cmp_branch_use_and_outside_bb_use( +; CHECK-VF1-IC2-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF1-IC2-NEXT: entry: +; CHECK-VF1-IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; CHECK-VF1-IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF1-IC2: vector.ph: +; CHECK-VF1-IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; CHECK-VF1-IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1-IC2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF1-IC2: vector.body: +; CHECK-VF1-IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[VEC_PHI:%.*]] = phi i1 [ false, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[VEC_PHI1:%.*]] = phi i1 [ false, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[VEC_PHI3:%.*]] = phi i1 [ false, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1-IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1-IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; CHECK-VF1-IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] +; CHECK-VF1-IC2-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP2]], align 4 +; CHECK-VF1-IC2-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP3]], align 4 +; CHECK-VF1-IC2-NEXT: [[TMP6:%.*]] = fcmp olt float [[TMP4]], 0.000000e+00 +; CHECK-VF1-IC2-NEXT: [[TMP7:%.*]] = fcmp olt float [[TMP5]], 0.000000e+00 +; CHECK-VF1-IC2-NEXT: [[TMP8]] = or i1 [[VEC_PHI2]], [[TMP6]] +; CHECK-VF1-IC2-NEXT: [[TMP9]] = or i1 [[VEC_PHI3]], [[TMP7]] +; CHECK-VF1-IC2-NEXT: [[TMP10:%.*]] = xor i1 [[TMP6]], true +; CHECK-VF1-IC2-NEXT: [[TMP11:%.*]] = xor i1 [[TMP7]], true +; CHECK-VF1-IC2-NEXT: [[TMP12]] = or i1 [[VEC_PHI]], [[TMP10]] +; CHECK-VF1-IC2-NEXT: [[TMP13]] = or i1 [[VEC_PHI1]], [[TMP11]] +; CHECK-VF1-IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-VF1-IC2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1-IC2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-VF1-IC2: middle.block: +; CHECK-VF1-IC2-NEXT: [[BIN_RDX:%.*]] = or i1 [[TMP13]], [[TMP12]] +; CHECK-VF1-IC2-NEXT: [[TMP15:%.*]] = freeze i1 [[BIN_RDX]] +; CHECK-VF1-IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP15]], i1 false, i1 true +; CHECK-VF1-IC2-NEXT: [[BIN_RDX4:%.*]] = or i1 [[TMP9]], [[TMP8]] +; CHECK-VF1-IC2-NEXT: [[TMP16:%.*]] = freeze i1 [[BIN_RDX4]] +; CHECK-VF1-IC2-NEXT: [[RDX_SELECT5:%.*]] = select i1 [[TMP16]], i1 true, i1 false +; CHECK-VF1-IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1-IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF1-IC2: scalar.ph: +; CHECK-VF1-IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF1-IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ true, [[ENTRY]] ] +; CHECK-VF1-IC2-NEXT: [[BC_MERGE_RDX6:%.*]] = phi i1 [ [[RDX_SELECT5]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY]] ] +; CHECK-VF1-IC2-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF1-IC2: for.body: +; CHECK-VF1-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ [[BC_MERGE_RDX6]], [[SCALAR_PH]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF1-IC2-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1-IC2-NEXT: [[CMP1:%.*]] = fcmp olt float [[LOAD1]], 0.000000e+00 +; CHECK-VF1-IC2-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF1-IC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF1-IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF1-IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF1-IC2: exit: +; CHECK-VF1-IC2-NEXT: [[CMP1_LCSSA:%.*]] = phi i1 [ [[CMP1]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1-IC2-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ], [ [[RDX_SELECT5]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1-IC2-NEXT: [[TMP17:%.*]] = zext i1 [[CMP1_LCSSA]] to i32 +; CHECK-VF1-IC2-NEXT: [[TMP18:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF1-IC2-NEXT: [[TMP19:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 [[TMP17]], i32 [[TMP18]] +; CHECK-VF1-IC2-NEXT: ret i32 [[TMP19]] +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %all.0.off010 = phi i1 [ true, %entry ], [ %all.0.off0., %for.body ] + %any.0.off09 = phi i1 [ false, %entry ], [ %.any.0.off0, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv + %load1 = load float, ptr %arrayidx, align 4 + %cmp1 = fcmp olt float %load1, 0.000000e+00 + %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09 + %all.0.off0. = select i1 %cmp1, i1 %all.0.off010, i1 false + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + %0 = zext i1 %cmp1 to i32 + %1 = select i1 %.any.0.off0, i32 2, i32 3 + %2 = select i1 %all.0.off0., i32 %0, i32 %1 + ret i32 %2 +} + +; Currently, this test-case is not supported. +; int multi_user_cmp_fmax(float* a, long long n) { +; _Bool any = 0; +; _Bool all = 1; +; float max = -INFINITY; +; for (long long i = 0; i < n; i++) { +; _Bool c = a[i] > max; +; if (c) { +; max = a[i]; +; any = 1; +; } else { +; all = 0; +; } +; } +; return all ? 1 : any ? 2 : 3; +; } +define i32 @multi_user_cmp_fmax(ptr readonly %a, i64 noundef %n) { +; CHECK-VF4-IC1-LABEL: define i32 @multi_user_cmp_fmax( +; CHECK-VF4-IC1-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF4-IC1-NEXT: entry: +; CHECK-VF4-IC1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4-IC1: for.body: +; CHECK-VF4-IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ true, [[ENTRY]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ false, [[ENTRY]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[MAX_015:%.*]] = phi float [ 0xFFF0000000000000, [[ENTRY]] ], [ [[DOTMAX_0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC1-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4-IC1-NEXT: [[CMP1:%.*]] = fcmp ogt float [[LOAD1]], [[MAX_015]] +; CHECK-VF4-IC1-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC1-NEXT: [[DOTMAX_0]] = select i1 [[CMP1]], float [[LOAD1]], float [[MAX_015]] +; CHECK-VF4-IC1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4-IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4-IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK-VF4-IC1: exit: +; CHECK-VF4-IC1-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[TMP0:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF4-IC1-NEXT: [[TMP1:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP0]] +; CHECK-VF4-IC1-NEXT: ret i32 [[TMP1]] +; +; CHECK-VF4-IC2-LABEL: define i32 @multi_user_cmp_fmax( +; CHECK-VF4-IC2-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF4-IC2-NEXT: entry: +; CHECK-VF4-IC2-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4-IC2: for.body: +; CHECK-VF4-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ true, [[ENTRY]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ false, [[ENTRY]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[MAX_015:%.*]] = phi float [ 0xFFF0000000000000, [[ENTRY]] ], [ [[DOTMAX_0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC2-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4-IC2-NEXT: [[CMP1:%.*]] = fcmp ogt float [[LOAD1]], [[MAX_015]] +; CHECK-VF4-IC2-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC2-NEXT: [[DOTMAX_0]] = select i1 [[CMP1]], float [[LOAD1]], float [[MAX_015]] +; CHECK-VF4-IC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4-IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4-IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK-VF4-IC2: exit: +; CHECK-VF4-IC2-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[TMP0:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF4-IC2-NEXT: [[TMP1:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP0]] +; CHECK-VF4-IC2-NEXT: ret i32 [[TMP1]] +; +; CHECK-VF1-IC2-LABEL: define i32 @multi_user_cmp_fmax( +; CHECK-VF1-IC2-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF1-IC2-NEXT: entry: +; CHECK-VF1-IC2-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF1-IC2: for.body: +; CHECK-VF1-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ true, [[ENTRY]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ false, [[ENTRY]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[MAX_015:%.*]] = phi float [ 0xFFF0000000000000, [[ENTRY]] ], [ [[DOTMAX_0:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF1-IC2-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1-IC2-NEXT: [[CMP1:%.*]] = fcmp ogt float [[LOAD1]], [[MAX_015]] +; CHECK-VF1-IC2-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF1-IC2-NEXT: [[DOTMAX_0]] = select i1 [[CMP1]], float [[LOAD1]], float [[MAX_015]] +; CHECK-VF1-IC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF1-IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF1-IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK-VF1-IC2: exit: +; CHECK-VF1-IC2-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[TMP0:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF1-IC2-NEXT: [[TMP1:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP0]] +; CHECK-VF1-IC2-NEXT: ret i32 [[TMP1]] +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %all.0.off010 = phi i1 [ true, %entry ], [ %all.0.off0., %for.body ] + %any.0.off09 = phi i1 [ false, %entry ], [ %.any.0.off0, %for.body ] + %max.015 = phi float [ 0xFFF0000000000000, %entry ], [ %.max.0, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv + %load1 = load float, ptr %arrayidx, align 4 + %cmp1 = fcmp ogt float %load1, %max.015 + %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09 + %all.0.off0. = select i1 %cmp1, i1 %all.0.off010, i1 false + %.max.0 = select i1 %cmp1, float %load1, float %max.015 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + %0 = select i1 %.any.0.off0, i32 2, i32 3 + %1 = select i1 %all.0.off0., i32 1, i32 %0 + ret i32 %1 +} + +; Currently, this test-case is not supported. +; int multi_user_cmp_max(int* a, long long n) { +; _Bool any = 0; +; _Bool all = 1; +; int max = 0; +; for (long long i = 0; i < n; i++) { +; _Bool c = a[i] > max; +; if (c) { +; max = a[i]; +; any = 1; +; } else { +; all = 0; +; } +; } +; return all ? 1 : any ? 2 : 3; +; } +define i32 @multi_user_cmp_max(ptr readonly %a, i64 noundef %n) { +; CHECK-VF4-IC1-LABEL: define i32 @multi_user_cmp_max( +; CHECK-VF4-IC1-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF4-IC1-NEXT: entry: +; CHECK-VF4-IC1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4-IC1: for.body: +; CHECK-VF4-IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ true, [[ENTRY]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ false, [[ENTRY]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[MAX_015:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[DOTMAX_0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC1-NEXT: [[LOAD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4-IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[LOAD1]], [[MAX_015]] +; CHECK-VF4-IC1-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC1-NEXT: [[DOTMAX_0]] = tail call i32 @llvm.smax.i32(i32 [[LOAD1]], i32 [[MAX_015]]) +; CHECK-VF4-IC1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4-IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4-IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK-VF4-IC1: exit: +; CHECK-VF4-IC1-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[TMP0:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF4-IC1-NEXT: [[TMP1:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP0]] +; CHECK-VF4-IC1-NEXT: ret i32 [[TMP1]] +; +; CHECK-VF4-IC2-LABEL: define i32 @multi_user_cmp_max( +; CHECK-VF4-IC2-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF4-IC2-NEXT: entry: +; CHECK-VF4-IC2-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4-IC2: for.body: +; CHECK-VF4-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ true, [[ENTRY]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ false, [[ENTRY]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[MAX_015:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[DOTMAX_0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC2-NEXT: [[LOAD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4-IC2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[LOAD1]], [[MAX_015]] +; CHECK-VF4-IC2-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC2-NEXT: [[DOTMAX_0]] = tail call i32 @llvm.smax.i32(i32 [[LOAD1]], i32 [[MAX_015]]) +; CHECK-VF4-IC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4-IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4-IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK-VF4-IC2: exit: +; CHECK-VF4-IC2-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[TMP0:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF4-IC2-NEXT: [[TMP1:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP0]] +; CHECK-VF4-IC2-NEXT: ret i32 [[TMP1]] +; +; CHECK-VF1-IC2-LABEL: define i32 @multi_user_cmp_max( +; CHECK-VF1-IC2-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF1-IC2-NEXT: entry: +; CHECK-VF1-IC2-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF1-IC2: for.body: +; CHECK-VF1-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ true, [[ENTRY]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ false, [[ENTRY]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[MAX_015:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[DOTMAX_0:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF1-IC2-NEXT: [[LOAD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1-IC2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[LOAD1]], [[MAX_015]] +; CHECK-VF1-IC2-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF1-IC2-NEXT: [[DOTMAX_0]] = tail call i32 @llvm.smax.i32(i32 [[LOAD1]], i32 [[MAX_015]]) +; CHECK-VF1-IC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF1-IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF1-IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK-VF1-IC2: exit: +; CHECK-VF1-IC2-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[TMP0:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF1-IC2-NEXT: [[TMP1:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP0]] +; CHECK-VF1-IC2-NEXT: ret i32 [[TMP1]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %all.0.off010 = phi i1 [ true, %entry ], [ %all.0.off0., %for.body ] + %any.0.off09 = phi i1 [ false, %entry ], [ %.any.0.off0, %for.body ] + %max.015 = phi i32 [ 0, %entry ], [ %.max.0, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv + %load1 = load i32, ptr %arrayidx, align 4 + %cmp1 = icmp sgt i32 %load1, %max.015 + %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09 + %all.0.off0. = select i1 %cmp1, i1 %all.0.off010, i1 false + %.max.0 = tail call i32 @llvm.smax.i32(i32 %load1, i32 %max.015) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body + %.any.0.off0.lcssa = phi i1 [ %.any.0.off0, %for.body ] + %all.0.off0..lcssa = phi i1 [ %all.0.off0., %for.body ] + %0 = select i1 %.any.0.off0.lcssa, i32 2, i32 3 + %1 = select i1 %all.0.off0..lcssa, i32 1, i32 %0 + ret i32 %1 +} + +declare i32 @llvm.smax.i32(i32, i32) + +; Currently, this test-case is not supported. +; int multi_user_cmp_use_store_offset(float* a, int *b, long long n) { +; _Bool any = 0; +; _Bool all = 1; +; for (long long i = 0; i < n; i++) { +; _Bool c = a[i] < 0.0f; +; if (c) { +; any = 1; +; } else { +; all = 0; +; } +; b[i+c] = any; +; } +; return all ? 1 : any ? 2 : 3; +; } +define i32 @multi_user_cmp_use_store_offset(ptr readonly %a, ptr writeonly %b, i64 noundef %n) { +; CHECK-VF4-IC1-LABEL: define i32 @multi_user_cmp_use_store_offset( +; CHECK-VF4-IC1-SAME: ptr readonly [[A:%.*]], ptr writeonly [[B:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF4-IC1-NEXT: entry: +; CHECK-VF4-IC1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4-IC1: for.body: +; CHECK-VF4-IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ true, [[ENTRY]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ false, [[ENTRY]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC1-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4-IC1-NEXT: [[CMP1:%.*]] = fcmp olt float [[LOAD1]], 0.000000e+00 +; CHECK-VF4-IC1-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4-IC1-NEXT: [[CONV4:%.*]] = zext i1 [[CMP1]] to i32 +; CHECK-VF4-IC1-NEXT: [[N32:%.*]] = trunc i64 [[N]] to i32 +; CHECK-VF4-IC1-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV4]], [[N32]] +; CHECK-VF4-IC1-NEXT: [[IDXPROM5:%.*]] = zext nneg i32 [[ADD]] to i64 +; CHECK-VF4-IC1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM5]] +; CHECK-VF4-IC1-NEXT: store i32 [[CONV4]], ptr [[ARRAYIDX6]], align 4 +; CHECK-VF4-IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4-IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK-VF4-IC1: exit: +; CHECK-VF4-IC1-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[TMP0:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF4-IC1-NEXT: [[TMP1:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP0]] +; CHECK-VF4-IC1-NEXT: ret i32 [[TMP1]] +; +; CHECK-VF4-IC2-LABEL: define i32 @multi_user_cmp_use_store_offset( +; CHECK-VF4-IC2-SAME: ptr readonly [[A:%.*]], ptr writeonly [[B:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF4-IC2-NEXT: entry: +; CHECK-VF4-IC2-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4-IC2: for.body: +; CHECK-VF4-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ true, [[ENTRY]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ false, [[ENTRY]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC2-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4-IC2-NEXT: [[CMP1:%.*]] = fcmp olt float [[LOAD1]], 0.000000e+00 +; CHECK-VF4-IC2-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4-IC2-NEXT: [[CONV4:%.*]] = zext i1 [[CMP1]] to i32 +; CHECK-VF4-IC2-NEXT: [[N32:%.*]] = trunc i64 [[N]] to i32 +; CHECK-VF4-IC2-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV4]], [[N32]] +; CHECK-VF4-IC2-NEXT: [[IDXPROM5:%.*]] = zext nneg i32 [[ADD]] to i64 +; CHECK-VF4-IC2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM5]] +; CHECK-VF4-IC2-NEXT: store i32 [[CONV4]], ptr [[ARRAYIDX6]], align 4 +; CHECK-VF4-IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4-IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK-VF4-IC2: exit: +; CHECK-VF4-IC2-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[TMP0:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF4-IC2-NEXT: [[TMP1:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP0]] +; CHECK-VF4-IC2-NEXT: ret i32 [[TMP1]] +; +; CHECK-VF1-IC2-LABEL: define i32 @multi_user_cmp_use_store_offset( +; CHECK-VF1-IC2-SAME: ptr readonly [[A:%.*]], ptr writeonly [[B:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF1-IC2-NEXT: entry: +; CHECK-VF1-IC2-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF1-IC2: for.body: +; CHECK-VF1-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ true, [[ENTRY]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ false, [[ENTRY]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF1-IC2-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1-IC2-NEXT: [[CMP1:%.*]] = fcmp olt float [[LOAD1]], 0.000000e+00 +; CHECK-VF1-IC2-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF1-IC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF1-IC2-NEXT: [[CONV4:%.*]] = zext i1 [[CMP1]] to i32 +; CHECK-VF1-IC2-NEXT: [[N32:%.*]] = trunc i64 [[N]] to i32 +; CHECK-VF1-IC2-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV4]], [[N32]] +; CHECK-VF1-IC2-NEXT: [[IDXPROM5:%.*]] = zext nneg i32 [[ADD]] to i64 +; CHECK-VF1-IC2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM5]] +; CHECK-VF1-IC2-NEXT: store i32 [[CONV4]], ptr [[ARRAYIDX6]], align 4 +; CHECK-VF1-IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF1-IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK-VF1-IC2: exit: +; CHECK-VF1-IC2-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[TMP0:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF1-IC2-NEXT: [[TMP1:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP0]] +; CHECK-VF1-IC2-NEXT: ret i32 [[TMP1]] +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %all.0.off010 = phi i1 [ true, %entry ], [ %all.0.off0., %for.body ] + %any.0.off09 = phi i1 [ false, %entry ], [ %.any.0.off0, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv + %load1 = load float, ptr %arrayidx, align 4 + %cmp1 = fcmp olt float %load1, 0.000000e+00 + %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09 + %all.0.off0. = select i1 %cmp1, i1 %all.0.off010, i1 false + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %conv4 = zext i1 %cmp1 to i32 + %n32 = trunc i64 %n to i32 + %add = add nuw nsw i32 %conv4, %n32 + %idxprom5 = zext nneg i32 %add to i64 + %arrayidx6 = getelementptr inbounds i32, ptr %b, i64 %idxprom5 + store i32 %conv4, ptr %arrayidx6, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + %0 = select i1 %.any.0.off0, i32 2, i32 3 + %1 = select i1 %all.0.off0., i32 1, i32 %0 + ret i32 %1 +} + +; Not vectorising, compare instruction user %0 inside the loop +define i32 @multi_user_cmp_no_vectorise(ptr readonly %a, i64 noundef %n) { +; CHECK-VF4-IC1-LABEL: define i32 @multi_user_cmp_no_vectorise( +; CHECK-VF4-IC1-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF4-IC1-NEXT: entry: +; CHECK-VF4-IC1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4-IC1: for.body: +; CHECK-VF4-IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ true, [[ENTRY]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ false, [[ENTRY]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC1-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4-IC1-NEXT: [[CMP1:%.*]] = fcmp olt float [[LOAD1]], 0.000000e+00 +; CHECK-VF4-IC1-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC1-NEXT: [[TMP0:%.*]] = sext i1 [[CMP1]] to i64 +; CHECK-VF4-IC1-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[INDVARS_IV]] +; CHECK-VF4-IC1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-VF4-IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4-IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK-VF4-IC1: exit: +; CHECK-VF4-IC1-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[TMP2:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF4-IC1-NEXT: [[TMP3:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP2]] +; CHECK-VF4-IC1-NEXT: ret i32 [[TMP3]] +; +; CHECK-VF4-IC2-LABEL: define i32 @multi_user_cmp_no_vectorise( +; CHECK-VF4-IC2-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF4-IC2-NEXT: entry: +; CHECK-VF4-IC2-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4-IC2: for.body: +; CHECK-VF4-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ true, [[ENTRY]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ false, [[ENTRY]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC2-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4-IC2-NEXT: [[CMP1:%.*]] = fcmp olt float [[LOAD1]], 0.000000e+00 +; CHECK-VF4-IC2-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC2-NEXT: [[TMP0:%.*]] = sext i1 [[CMP1]] to i64 +; CHECK-VF4-IC2-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[INDVARS_IV]] +; CHECK-VF4-IC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-VF4-IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4-IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK-VF4-IC2: exit: +; CHECK-VF4-IC2-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[TMP2:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF4-IC2-NEXT: [[TMP3:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP2]] +; CHECK-VF4-IC2-NEXT: ret i32 [[TMP3]] +; +; CHECK-VF1-IC2-LABEL: define i32 @multi_user_cmp_no_vectorise( +; CHECK-VF1-IC2-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF1-IC2-NEXT: entry: +; CHECK-VF1-IC2-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF1-IC2: for.body: +; CHECK-VF1-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ true, [[ENTRY]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ false, [[ENTRY]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF1-IC2-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1-IC2-NEXT: [[CMP1:%.*]] = fcmp olt float [[LOAD1]], 0.000000e+00 +; CHECK-VF1-IC2-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF1-IC2-NEXT: [[TMP0:%.*]] = sext i1 [[CMP1]] to i64 +; CHECK-VF1-IC2-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[INDVARS_IV]] +; CHECK-VF1-IC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-VF1-IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF1-IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK-VF1-IC2: exit: +; CHECK-VF1-IC2-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[TMP2:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF1-IC2-NEXT: [[TMP3:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP2]] +; CHECK-VF1-IC2-NEXT: ret i32 [[TMP3]] +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %all.0.off010 = phi i1 [ true, %entry ], [ %all.0.off0., %for.body ] + %any.0.off09 = phi i1 [ false, %entry ], [ %.any.0.off0, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv + %load1 = load float, ptr %arrayidx, align 4 + %cmp1 = fcmp olt float %load1, 0.000000e+00 + %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09 + %all.0.off0. = select i1 %cmp1, i1 %all.0.off010, i1 false + %0 = sext i1 %cmp1 to i64 + %1 = add i64 %0, %indvars.iv + %indvars.iv.next = add nuw nsw i64 %1, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + %2 = select i1 %.any.0.off0, i32 2, i32 3 + %3 = select i1 %all.0.off0., i32 1, i32 %2 + ret i32 %3 +} + +; Not vectorising, non recurrent select instrction %0 inside the loop +define i32 @multi_user_cmp_extra_select(ptr readonly %a, i64 noundef %n) { +; CHECK-VF4-IC1-LABEL: define i32 @multi_user_cmp_extra_select( +; CHECK-VF4-IC1-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF4-IC1-NEXT: entry: +; CHECK-VF4-IC1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4-IC1: for.body: +; CHECK-VF4-IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ true, [[ENTRY]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ false, [[ENTRY]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC1-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4-IC1-NEXT: [[CMP1:%.*]] = fcmp olt float [[LOAD1]], 0.000000e+00 +; CHECK-VF4-IC1-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC1-NEXT: [[TMP0:%.*]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4-IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4-IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK-VF4-IC1: exit: +; CHECK-VF4-IC1-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ] +; CHECK-VF4-IC1-NEXT: [[TMP1:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF4-IC1-NEXT: [[TMP2:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP1]] +; CHECK-VF4-IC1-NEXT: ret i32 [[TMP2]] +; +; CHECK-VF4-IC2-LABEL: define i32 @multi_user_cmp_extra_select( +; CHECK-VF4-IC2-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF4-IC2-NEXT: entry: +; CHECK-VF4-IC2-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF4-IC2: for.body: +; CHECK-VF4-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ true, [[ENTRY]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ false, [[ENTRY]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF4-IC2-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4-IC2-NEXT: [[CMP1:%.*]] = fcmp olt float [[LOAD1]], 0.000000e+00 +; CHECK-VF4-IC2-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC2-NEXT: [[TMP0:%.*]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF4-IC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF4-IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF4-IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK-VF4-IC2: exit: +; CHECK-VF4-IC2-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ] +; CHECK-VF4-IC2-NEXT: [[TMP1:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF4-IC2-NEXT: [[TMP2:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP1]] +; CHECK-VF4-IC2-NEXT: ret i32 [[TMP2]] +; +; CHECK-VF1-IC2-LABEL: define i32 @multi_user_cmp_extra_select( +; CHECK-VF1-IC2-SAME: ptr readonly [[A:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-VF1-IC2-NEXT: entry: +; CHECK-VF1-IC2-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF1-IC2: for.body: +; CHECK-VF1-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF010:%.*]] = phi i1 [ true, [[ENTRY]] ], [ [[ALL_0_OFF0_:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ANY_0_OFF09:%.*]] = phi i1 [ false, [[ENTRY]] ], [ [[DOTANY_0_OFF0:%.*]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-VF1-IC2-NEXT: [[LOAD1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1-IC2-NEXT: [[CMP1:%.*]] = fcmp olt float [[LOAD1]], 0.000000e+00 +; CHECK-VF1-IC2-NEXT: [[DOTANY_0_OFF0]] = select i1 [[CMP1]], i1 true, i1 [[ANY_0_OFF09]] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF0_]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF1-IC2-NEXT: [[TMP0:%.*]] = select i1 [[CMP1]], i1 [[ALL_0_OFF010]], i1 false +; CHECK-VF1-IC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-VF1-IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-VF1-IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK-VF1-IC2: exit: +; CHECK-VF1-IC2-NEXT: [[DOTANY_0_OFF0_LCSSA:%.*]] = phi i1 [ [[DOTANY_0_OFF0]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[ALL_0_OFF0__LCSSA:%.*]] = phi i1 [ [[ALL_0_OFF0_]], [[FOR_BODY]] ] +; CHECK-VF1-IC2-NEXT: [[TMP1:%.*]] = select i1 [[DOTANY_0_OFF0_LCSSA]], i32 2, i32 3 +; CHECK-VF1-IC2-NEXT: [[TMP2:%.*]] = select i1 [[ALL_0_OFF0__LCSSA]], i32 1, i32 [[TMP1]] +; CHECK-VF1-IC2-NEXT: ret i32 [[TMP2]] +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %all.0.off010 = phi i1 [ true, %entry ], [ %all.0.off0., %for.body ] + %any.0.off09 = phi i1 [ false, %entry ], [ %.any.0.off0, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv + %load1 = load float, ptr %arrayidx, align 4 + %cmp1 = fcmp olt float %load1, 0.000000e+00 + %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09 + %all.0.off0. = select i1 %cmp1, i1 %all.0.off010, i1 false + %0 = select i1 %cmp1, i1 %all.0.off010, i1 false + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: + %1 = select i1 %.any.0.off0, i32 2, i32 3 + %2 = select i1 %all.0.off0., i32 1, i32 %1 + ret i32 %2 +} +;. +; CHECK-VF4-IC1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF4-IC1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF4-IC1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF4-IC1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-VF4-IC1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF4-IC1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-VF4-IC1: [[META6]] = !{[[META7:![0-9]+]]} +; CHECK-VF4-IC1: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]]} +; CHECK-VF4-IC1: [[META8]] = distinct !{[[META8]], !"LVerDomain"} +; CHECK-VF4-IC1: [[META9]] = !{[[META10:![0-9]+]]} +; CHECK-VF4-IC1: [[META10]] = distinct !{[[META10]], [[META8]]} +; CHECK-VF4-IC1: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]} +; CHECK-VF4-IC1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]]} +; CHECK-VF4-IC1: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]]} +; CHECK-VF4-IC1: [[LOOP14]] = distinct !{[[LOOP14]], [[META2]], [[META1]]} +;. +; CHECK-VF4-IC2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF4-IC2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF4-IC2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF4-IC2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-VF4-IC2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF4-IC2: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-VF4-IC2: [[META6]] = !{[[META7:![0-9]+]]} +; CHECK-VF4-IC2: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]]} +; CHECK-VF4-IC2: [[META8]] = distinct !{[[META8]], !"LVerDomain"} +; CHECK-VF4-IC2: [[META9]] = !{[[META10:![0-9]+]]} +; CHECK-VF4-IC2: [[META10]] = distinct !{[[META10]], [[META8]]} +; CHECK-VF4-IC2: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]} +; CHECK-VF4-IC2: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]]} +; CHECK-VF4-IC2: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]]} +; CHECK-VF4-IC2: [[LOOP14]] = distinct !{[[LOOP14]], [[META2]], [[META1]]} +;. +; CHECK-VF1-IC2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF1-IC2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF1-IC2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF1-IC2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +; CHECK-VF1-IC2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF1-IC2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} +; CHECK-VF1-IC2: [[META6]] = !{[[META7:![0-9]+]]} +; CHECK-VF1-IC2: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]]} +; CHECK-VF1-IC2: [[META8]] = distinct !{[[META8]], !"LVerDomain"} +; CHECK-VF1-IC2: [[META9]] = !{[[META10:![0-9]+]]} +; CHECK-VF1-IC2: [[META10]] = distinct !{[[META10]], [[META8]]} +; CHECK-VF1-IC2: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]} +; CHECK-VF1-IC2: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]]} +; CHECK-VF1-IC2: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]]} +; CHECK-VF1-IC2: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]]} +;. +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll index 993b56a05207be..da0f7283d80d5b 100644 --- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll @@ -272,33 +272,6 @@ exit: ; preds = %for.body } -; We don't support select/cmp reduction patterns where there is more than one -; use of the icmp/fcmp. -define i32 @select_const_i32_from_icmp_mul_use(ptr nocapture readonly %v1, ptr %v2, i64 %n) { -; CHECK-LABEL: @select_const_i32_from_icmp_mul_use -; CHECK-NOT: vector.body -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %0 = phi i64 [ 0, %entry ], [ %8, %for.body ] - %1 = phi i32 [ 3, %entry ], [ %6, %for.body ] - %2 = phi i32 [ 0, %entry ], [ %7, %for.body ] - %3 = getelementptr inbounds i32, ptr %v1, i64 %0 - %4 = load i32, ptr %3, align 4 - %5 = icmp eq i32 %4, 3 - %6 = select i1 %5, i32 %1, i32 7 - %7 = zext i1 %5 to i32 - %8 = add nuw nsw i64 %0, 1 - %9 = icmp eq i64 %8, %n - br i1 %9, label %exit, label %for.body - -exit: ; preds = %for.body - store i32 %7, ptr %v2, align 4 - ret i32 %6 -} - - ; We don't support selecting loop-variant values. define i32 @select_variant_i32_from_icmp(ptr nocapture readonly %v1, ptr nocapture readonly %v2, i64 %n) { ; CHECK-LABEL: @select_variant_i32_from_icmp diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-zero-size.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-zero-size.ll index 33f7f530f6f995..e81044defbb3af 100644 --- a/llvm/test/Transforms/MemCpyOpt/memcpy-zero-size.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy-zero-size.ll @@ -34,3 +34,18 @@ define void @pr64886(i64 %len, ptr noalias %p) { call void @llvm.memcpy.p0.p0.i64(ptr inttoptr (i64 -1 to ptr), ptr %p, i64 poison, i1 false) ret void } + +define void @pr98610(ptr %p, ptr noalias %p2) { +; CHECK-LABEL: @pr98610( +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[P:%.*]], i8 0, i64 1, i1 false) +; CHECK-NEXT: [[ZERO_EXT:%.*]] = zext i32 0 to i64 +; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[ZERO_EXT]], 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[P]], ptr [[P2:%.*]], i64 [[MUL]], i1 false) +; CHECK-NEXT: ret void +; + call void @llvm.memset.p0.i64(ptr %p, i8 0, i64 1, i1 false) + %zero.ext = zext i32 0 to i64 + %mul = mul i64 %zero.ext, 1 + call void @llvm.memcpy.p0.p0.i64(ptr %p, ptr %p2, i64 %mul, i1 false) + ret void +} diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-dbgloc.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-dbgloc.ll index f8536aba2d19a9..3f577f09ada1eb 100644 --- a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-dbgloc.ll +++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-dbgloc.ll @@ -8,15 +8,20 @@ declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1) declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1) define void @test_constant(i64 %src_size, ptr %dst, i64 %dst_size, i8 %c) !dbg !5 { -; CHECK-LABEL: @test_constant( -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE:%.*]], !dbg [[DBG11:![0-9]+]] +; CHECK-LABEL: define void @test_constant( +; CHECK-SAME: i64 [[SRC_SIZE:%.*]], ptr [[DST:%.*]], i64 [[DST_SIZE:%.*]], i8 [[C:%.*]]) !dbg [[DBG5:![0-9]+]] { +; CHECK-NEXT: [[NON_ZERO:%.*]] = icmp ne i64 [[SRC_SIZE]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[NON_ZERO]]) +; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE]], [[SRC_SIZE]], !dbg [[DBG11:![0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DST_SIZE]], [[SRC_SIZE]], !dbg [[DBG11]] ; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP2]], !dbg [[DBG11]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[SRC_SIZE]], !dbg [[DBG11]] -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP4]], i8 [[C:%.*]], i64 [[TMP3]], i1 false), !dbg [[DBG11]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 [[SRC_SIZE]], !dbg [[DBG11]] +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP4]], i8 [[C]], i64 [[TMP3]], i1 false), !dbg [[DBG11]] ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST]], ptr @C, i64 [[SRC_SIZE]], i1 false), !dbg [[DBG12:![0-9]+]] ; CHECK-NEXT: ret void, !dbg [[DBG13:![0-9]+]] ; + %non.zero = icmp ne i64 %src_size, 0 + call void @llvm.assume(i1 %non.zero) call void @llvm.memset.p0.i64(ptr %dst, i8 %c, i64 %dst_size, i1 false), !dbg !11 call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr @C, i64 %src_size, i1 false), !dbg !12 ret void, !dbg !13 diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll index 72a234091f79b0..68356397423507 100644 --- a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll +++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll @@ -7,7 +7,9 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" define void @test_constant(i64 %src_size, ptr %dst, i64 %dst_size, i8 %c) { ; CHECK-LABEL: @test_constant( -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE:%.*]] +; CHECK-NEXT: [[NON_ZERO:%.*]] = icmp ne i64 [[SRC_SIZE:%.*]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[NON_ZERO]]) +; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE]] ; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DST_SIZE]], [[SRC_SIZE]] ; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[SRC_SIZE]] @@ -15,6 +17,8 @@ define void @test_constant(i64 %src_size, ptr %dst, i64 %dst_size, i8 %c) { ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST]], ptr @C, i64 [[SRC_SIZE]], i1 false) ; CHECK-NEXT: ret void ; + %non.zero = icmp ne i64 %src_size, 0 + call void @llvm.assume(i1 %non.zero) call void @llvm.memset.p0.i64(ptr %dst, i8 %c, i64 %dst_size, i1 false) call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr @C, i64 %src_size, i1 false) ret void @@ -22,7 +26,9 @@ define void @test_constant(i64 %src_size, ptr %dst, i64 %dst_size, i8 %c) { define void @test(ptr %src, i64 %src_size, ptr noalias %dst, i64 %dst_size, i8 %c) { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE:%.*]] +; CHECK-NEXT: [[NON_ZERO:%.*]] = icmp ne i64 [[SRC_SIZE:%.*]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[NON_ZERO]]) +; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE]] ; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DST_SIZE]], [[SRC_SIZE]] ; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[SRC_SIZE]] @@ -30,6 +36,8 @@ define void @test(ptr %src, i64 %src_size, ptr noalias %dst, i64 %dst_size, i8 % ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST]], ptr [[SRC:%.*]], i64 [[SRC_SIZE]], i1 false) ; CHECK-NEXT: ret void ; + %non.zero = icmp ne i64 %src_size, 0 + call void @llvm.assume(i1 %non.zero) call void @llvm.memset.p0.i64(ptr %dst, i8 %c, i64 %dst_size, i1 false) call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 %src_size, i1 false) ret void @@ -37,8 +45,10 @@ define void @test(ptr %src, i64 %src_size, ptr noalias %dst, i64 %dst_size, i8 % define void @test_different_types_i32_i64(ptr noalias %dst, ptr %src, i32 %dst_size, i64 %src_size, i8 %c) { ; CHECK-LABEL: @test_different_types_i32_i64( +; CHECK-NEXT: [[NON_ZERO:%.*]] = icmp ne i64 [[SRC_SIZE:%.*]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[NON_ZERO]]) ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[DST_SIZE:%.*]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i64 [[TMP1]], [[SRC_SIZE:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i64 [[TMP1]], [[SRC_SIZE]] ; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], [[SRC_SIZE]] ; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], i64 0, i64 [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[SRC_SIZE]] @@ -46,6 +56,8 @@ define void @test_different_types_i32_i64(ptr noalias %dst, ptr %src, i32 %dst_s ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST]], ptr [[SRC:%.*]], i64 [[SRC_SIZE]], i1 false) ; CHECK-NEXT: ret void ; + %non.zero = icmp ne i64 %src_size, 0 + call void @llvm.assume(i1 %non.zero) call void @llvm.memset.p0.i32(ptr %dst, i8 %c, i32 %dst_size, i1 false) call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 %src_size, i1 false) ret void @@ -53,7 +65,9 @@ define void @test_different_types_i32_i64(ptr noalias %dst, ptr %src, i32 %dst_s define void @test_different_types_i128_i32(ptr noalias %dst, ptr %src, i128 %dst_size, i32 %src_size, i8 %c) { ; CHECK-LABEL: @test_different_types_i128_i32( -; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[SRC_SIZE:%.*]] to i128 +; CHECK-NEXT: [[NON_ZERO:%.*]] = icmp ne i32 [[SRC_SIZE:%.*]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[NON_ZERO]]) +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[SRC_SIZE]] to i128 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i128 [[DST_SIZE:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = sub i128 [[DST_SIZE]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], i128 0, i128 [[TMP3]] @@ -62,6 +76,8 @@ define void @test_different_types_i128_i32(ptr noalias %dst, ptr %src, i128 %dst ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr [[DST]], ptr [[SRC:%.*]], i32 [[SRC_SIZE]], i1 false) ; CHECK-NEXT: ret void ; + %non.zero = icmp ne i32 %src_size, 0 + call void @llvm.assume(i1 %non.zero) call void @llvm.memset.p0.i128(ptr %dst, i8 %c, i128 %dst_size, i1 false) call void @llvm.memcpy.p0.p0.i32(ptr %dst, ptr %src, i32 %src_size, i1 false) ret void @@ -69,8 +85,10 @@ define void @test_different_types_i128_i32(ptr noalias %dst, ptr %src, i128 %dst define void @test_different_types_i32_i128(ptr noalias %dst, ptr %src, i32 %dst_size, i128 %src_size, i8 %c) { ; CHECK-LABEL: @test_different_types_i32_i128( +; CHECK-NEXT: [[NON_ZERO:%.*]] = icmp ne i128 [[SRC_SIZE:%.*]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[NON_ZERO]]) ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[DST_SIZE:%.*]] to i128 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i128 [[TMP1]], [[SRC_SIZE:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i128 [[TMP1]], [[SRC_SIZE]] ; CHECK-NEXT: [[TMP3:%.*]] = sub i128 [[TMP1]], [[SRC_SIZE]] ; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], i128 0, i128 [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST:%.*]], i128 [[SRC_SIZE]] @@ -78,6 +96,8 @@ define void @test_different_types_i32_i128(ptr noalias %dst, ptr %src, i32 %dst_ ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i128(ptr [[DST]], ptr [[SRC:%.*]], i128 [[SRC_SIZE]], i1 false) ; CHECK-NEXT: ret void ; + %non.zero = icmp ne i128 %src_size, 0 + call void @llvm.assume(i1 %non.zero) call void @llvm.memset.p0.i32(ptr %dst, i8 %c, i32 %dst_size, i1 false) call void @llvm.memcpy.p0.p0.i128(ptr %dst, ptr %src, i128 %src_size, i1 false) ret void @@ -85,7 +105,9 @@ define void @test_different_types_i32_i128(ptr noalias %dst, ptr %src, i32 %dst_ define void @test_different_types_i64_i32(ptr noalias %dst, ptr %src, i64 %dst_size, i32 %src_size, i8 %c) { ; CHECK-LABEL: @test_different_types_i64_i32( -; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[SRC_SIZE:%.*]] to i64 +; CHECK-NEXT: [[NON_ZERO:%.*]] = icmp ne i32 [[SRC_SIZE:%.*]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[NON_ZERO]]) +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[SRC_SIZE]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[DST_SIZE]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], i64 0, i64 [[TMP3]] @@ -94,6 +116,8 @@ define void @test_different_types_i64_i32(ptr noalias %dst, ptr %src, i64 %dst_s ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr [[DST]], ptr [[SRC:%.*]], i32 [[SRC_SIZE]], i1 false) ; CHECK-NEXT: ret void ; + %non.zero = icmp ne i32 %src_size, 0 + call void @llvm.assume(i1 %non.zero) call void @llvm.memset.p0.i64(ptr %dst, i8 %c, i64 %dst_size, i1 false) call void @llvm.memcpy.p0.p0.i32(ptr %dst, ptr %src, i32 %src_size, i1 false) ret void @@ -146,14 +170,18 @@ define void @test_align_memcpy(ptr %src, ptr noalias %dst, i64 %dst_size) { define void @test_non_i8_dst_type(ptr %src, i64 %src_size, ptr noalias %dst_pi64, i64 %dst_size, i8 %c) { ; CHECK-LABEL: @test_non_i8_dst_type( -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE:%.*]] +; CHECK-NEXT: [[NON_ZERO:%.*]] = icmp ne i64 [[SRC_SIZE:%.*]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[NON_ZERO]]) +; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE]] ; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DST_SIZE]], [[SRC_SIZE]] ; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST_PI64:%.*]], i64 [[SRC_SIZE]] ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP4]], i8 [[C:%.*]], i64 [[TMP3]], i1 false) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST_PI64:%.*]], ptr [[SRC:%.*]], i64 [[SRC_SIZE]], i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST_PI64]], ptr [[SRC:%.*]], i64 [[SRC_SIZE]], i1 false) ; CHECK-NEXT: ret void ; + %non.zero = icmp ne i64 %src_size, 0 + call void @llvm.assume(i1 %non.zero) call void @llvm.memset.p0.i64(ptr %dst_pi64, i8 %c, i64 %dst_size, i1 false) call void @llvm.memcpy.p0.p0.i64(ptr %dst_pi64, ptr %src, i64 %src_size, i1 false) ret void @@ -161,10 +189,14 @@ define void @test_non_i8_dst_type(ptr %src, i64 %src_size, ptr noalias %dst_pi64 define void @test_different_dst(ptr noalias %dst2, ptr %src, i64 %src_size, ptr noalias %dst, i64 %dst_size) { ; CHECK-LABEL: @test_different_dst( +; CHECK-NEXT: [[NON_ZERO:%.*]] = icmp ne i64 [[SRC_SIZE:%.*]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[NON_ZERO]]) ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[DST:%.*]], i8 0, i64 [[DST_SIZE:%.*]], i1 false) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST2:%.*]], ptr [[SRC:%.*]], i64 [[SRC_SIZE:%.*]], i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST2:%.*]], ptr [[SRC:%.*]], i64 [[SRC_SIZE]], i1 false) ; CHECK-NEXT: ret void ; + %non.zero = icmp ne i64 %src_size, 0 + call void @llvm.assume(i1 %non.zero) call void @llvm.memset.p0.i64(ptr %dst, i8 0, i64 %dst_size, i1 false) call void @llvm.memcpy.p0.p0.i64(ptr %dst2, ptr %src, i64 %src_size, i1 false) ret void @@ -206,11 +238,15 @@ define void @test_intermediate_write(ptr %b) #0 { define void @test_throwing_call(ptr %src, i64 %src_size, ptr noalias %dst, i64 %dst_size, i8 %c) { ; CHECK-LABEL: @test_throwing_call( +; CHECK-NEXT: [[NON_ZERO:%.*]] = icmp ne i64 [[SRC_SIZE:%.*]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[NON_ZERO]]) ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[DST:%.*]], i8 [[C:%.*]], i64 [[DST_SIZE:%.*]], i1 false) -; CHECK-NEXT: call void @call() #[[ATTR2:[0-9]+]] -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST]], ptr [[SRC:%.*]], i64 [[SRC_SIZE:%.*]], i1 false) +; CHECK-NEXT: call void @call() #[[ATTR3:[0-9]+]] +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST]], ptr [[SRC:%.*]], i64 [[SRC_SIZE]], i1 false) ; CHECK-NEXT: ret void ; + %non.zero = icmp ne i64 %src_size, 0 + call void @llvm.assume(i1 %non.zero) call void @llvm.memset.p0.i64(ptr %dst, i8 %c, i64 %dst_size, i1 false) call void @call() readnone call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 %src_size, i1 false) @@ -220,8 +256,10 @@ define void @test_throwing_call(ptr %src, i64 %src_size, ptr noalias %dst, i64 % define void @test_throwing_call_alloca(ptr %src, i64 %src_size, i64 %dst_size, i8 %c) { ; CHECK-LABEL: @test_throwing_call_alloca( ; CHECK-NEXT: [[DST:%.*]] = alloca i8, align 1 -; CHECK-NEXT: call void @call() #[[ATTR2]] -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE:%.*]] +; CHECK-NEXT: [[NON_ZERO:%.*]] = icmp ne i64 [[SRC_SIZE:%.*]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[NON_ZERO]]) +; CHECK-NEXT: call void @call() #[[ATTR3]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE]] ; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DST_SIZE]], [[SRC_SIZE]] ; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 [[SRC_SIZE]] @@ -230,6 +268,8 @@ define void @test_throwing_call_alloca(ptr %src, i64 %src_size, i64 %dst_size, i ; CHECK-NEXT: ret void ; %dst = alloca i8 + %non.zero = icmp ne i64 %src_size, 0 + call void @llvm.assume(i1 %non.zero) call void @llvm.memset.p0.i64(ptr %dst, i8 %c, i64 %dst_size, i1 false) call void @call() readnone call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 %src_size, i1 false) @@ -240,10 +280,14 @@ define void @test_throwing_call_alloca(ptr %src, i64 %src_size, i64 %dst_size, i ; is not legal. define void @test_missing_noalias(ptr %src, i64 %src_size, ptr %dst, i64 %dst_size, i8 %c) { ; CHECK-LABEL: @test_missing_noalias( +; CHECK-NEXT: [[NON_ZERO:%.*]] = icmp ne i64 [[SRC_SIZE:%.*]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[NON_ZERO]]) ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[DST:%.*]], i8 [[C:%.*]], i64 [[DST_SIZE:%.*]], i1 false) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST]], ptr [[SRC:%.*]], i64 [[SRC_SIZE:%.*]], i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST]], ptr [[SRC:%.*]], i64 [[SRC_SIZE]], i1 false) ; CHECK-NEXT: ret void ; + %non.zero = icmp ne i64 %src_size, 0 + call void @llvm.assume(i1 %non.zero) call void @llvm.memset.p0.i64(ptr %dst, i8 %c, i64 %dst_size, i1 false) call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 %src_size, i1 false) ret void @@ -261,9 +305,13 @@ define void @test_same_const_size(ptr noalias %src, ptr noalias %dst, i8 %c) { define void @test_same_dynamic_size(ptr noalias %src, ptr noalias %dst, i64 %size, i8 %c) { ; CHECK-LABEL: @test_same_dynamic_size( -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[SIZE:%.*]], i1 false) +; CHECK-NEXT: [[NON_ZERO:%.*]] = icmp ne i64 [[SIZE:%.*]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[NON_ZERO]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[SIZE]], i1 false) ; CHECK-NEXT: ret void ; + %non.zero = icmp ne i64 %size, 0 + call void @llvm.assume(i1 %non.zero) call void @llvm.memset.p0.i64(ptr %dst, i8 %c, i64 %size, i1 false) call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 %size, i1 false) ret void @@ -286,9 +334,11 @@ define void @test_must_alias_same_size(ptr noalias %src, ptr noalias %dst, i8 %c define void @test_must_alias_different_size(ptr noalias %src, i64 %src_size, ptr noalias %dst, i64 %dst_size, i8 %c) { ; CHECK-LABEL: @test_must_alias_different_size( +; CHECK-NEXT: [[NON_ZERO:%.*]] = icmp ne i64 [[SRC_SIZE:%.*]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[NON_ZERO]]) ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 16 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[DST]], i64 16 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE]] ; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DST_SIZE]], [[SRC_SIZE]] ; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[GEP2]], i64 [[SRC_SIZE]] @@ -296,6 +346,8 @@ define void @test_must_alias_different_size(ptr noalias %src, i64 %src_size, ptr ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[GEP2]], ptr [[SRC:%.*]], i64 [[SRC_SIZE]], i1 false) ; CHECK-NEXT: ret void ; + %non.zero = icmp ne i64 %src_size, 0 + call void @llvm.assume(i1 %non.zero) %gep1 = getelementptr i8, ptr %dst, i64 16 call void @llvm.memset.p0.i64(ptr %gep1, i8 %c, i64 %dst_size, i1 false) %gep2 = getelementptr i8, ptr %dst, i64 16 @@ -305,14 +357,18 @@ define void @test_must_alias_different_size(ptr noalias %src, i64 %src_size, ptr define void @test_weird_element_type(ptr %src, i64 %src_size, ptr noalias %dst, i64 %dst_size, i8 %c) { ; CHECK-LABEL: @test_weird_element_type( -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE:%.*]] +; CHECK-NEXT: [[NON_ZERO:%.*]] = icmp ne i64 [[SRC_SIZE:%.*]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[NON_ZERO]]) +; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE]] ; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DST_SIZE]], [[SRC_SIZE]] ; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[SRC_SIZE]] -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP5]], i8 [[C:%.*]], i64 [[TMP3]], i1 false) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[SRC_SIZE]] +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP4]], i8 [[C:%.*]], i64 [[TMP3]], i1 false) ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST]], ptr [[SRC:%.*]], i64 [[SRC_SIZE]], i1 false) ; CHECK-NEXT: ret void ; + %non.zero = icmp ne i64 %src_size, 0 + call void @llvm.assume(i1 %non.zero) call void @llvm.memset.p0.i64(ptr %dst, i8 %c, i64 %dst_size, i1 false) call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 %src_size, i1 false) ret void @@ -320,7 +376,9 @@ define void @test_weird_element_type(ptr %src, i64 %src_size, ptr noalias %dst, define void @test_addrspace(ptr addrspace(1) %src, i64 %src_size, ptr addrspace(1) noalias %dst, i64 %dst_size, i8 %c) { ; CHECK-LABEL: @test_addrspace( -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE:%.*]] +; CHECK-NEXT: [[NON_ZERO:%.*]] = icmp ne i64 [[SRC_SIZE:%.*]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[NON_ZERO]]) +; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE]] ; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DST_SIZE]], [[SRC_SIZE]] ; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(1) [[DST:%.*]], i64 [[SRC_SIZE]] @@ -328,6 +386,8 @@ define void @test_addrspace(ptr addrspace(1) %src, i64 %src_size, ptr addrspace( ; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC:%.*]], i64 [[SRC_SIZE]], i1 false) ; CHECK-NEXT: ret void ; + %non.zero = icmp ne i64 %src_size, 0 + call void @llvm.assume(i1 %non.zero) call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %c, i64 %dst_size, i1 false) call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %src_size, i1 false) ret void diff --git a/llvm/test/Transforms/MemCpyOpt/opaque-ptr.ll b/llvm/test/Transforms/MemCpyOpt/opaque-ptr.ll index dbd78604398f0c..a968195623780d 100644 --- a/llvm/test/Transforms/MemCpyOpt/opaque-ptr.ll +++ b/llvm/test/Transforms/MemCpyOpt/opaque-ptr.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -passes=memcpyopt -S %s -verify-memoryssa | FileCheck %s -define void @test_memset_memcpy(ptr %src, i64 %src_size, ptr noalias %dst, i64 %dst_size, i8 %c) { +define void @test_memset_memcpy(ptr %src, i64 range(i64 1, 42) %src_size, ptr noalias %dst, i64 %dst_size, i8 %c) { ; CHECK-LABEL: @test_memset_memcpy( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DST_SIZE]], [[SRC_SIZE]] diff --git a/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll b/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll index 011d51600b51f0..c2ed7b9c845233 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll @@ -88,20 +88,10 @@ define <4 x i64> @x86_pblendvb_v8i32_v4i32(<4 x i64> %a, <4 x i64> %b, <4 x i64> ; CHECK-NEXT: [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <8 x i32> ; CHECK-NEXT: [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <8 x i32> ; CHECK-NEXT: [[CMP:%.*]] = icmp slt <8 x i32> [[C_BC]], [[D_BC]] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i1> [[CMP]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[A:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[B:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP5]], <4 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i1> [[CMP]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[A]] to <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i64> [[B]] to <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> [[TMP11]], <4 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> -; CHECK-NEXT: [[RES:%.*]] = bitcast <8 x i32> [[TMP13]] to <4 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[A:%.*]] to <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[B:%.*]] to <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[TMP2]], <8 x i32> [[TMP1]] +; CHECK-NEXT: [[RES:%.*]] = bitcast <8 x i32> [[TMP3]] to <4 x i64> ; CHECK-NEXT: ret <4 x i64> [[RES]] ; %a.bc = bitcast <4 x i64> %a to <32 x i8> @@ -129,20 +119,10 @@ define <4 x i64> @x86_pblendvb_v16i16_v8i16(<4 x i64> %a, <4 x i64> %b, <4 x i64 ; CHECK-NEXT: [[C_BC:%.*]] = bitcast <4 x i64> [[C:%.*]] to <16 x i16> ; CHECK-NEXT: [[D_BC:%.*]] = bitcast <4 x i64> [[D:%.*]] to <16 x i16> ; CHECK-NEXT: [[CMP:%.*]] = icmp slt <16 x i16> [[C_BC]], [[D_BC]] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i1> [[CMP]], <16 x i1> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[A:%.*]] to <16 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[B:%.*]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[TMP5]], <8 x i16> [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i1> [[CMP]], <16 x i1> poison, <8 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[A]] to <16 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP8]], <16 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i64> [[B]] to <16 x i16> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i16> [[TMP11]], <8 x i16> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> [[TMP12]], <16 x i32> -; CHECK-NEXT: [[RES:%.*]] = bitcast <16 x i16> [[TMP13]] to <4 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[A:%.*]] to <16 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[B:%.*]] to <16 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[TMP2]], <16 x i16> [[TMP1]] +; CHECK-NEXT: [[RES:%.*]] = bitcast <16 x i16> [[TMP3]] to <4 x i64> ; CHECK-NEXT: ret <4 x i64> [[RES]] ; %a.bc = bitcast <4 x i64> %a to <32 x i8> @@ -276,20 +256,10 @@ define <8 x i64> @x86_pblendvb_v16i32_v8i32(<8 x i64> %a, <8 x i64> %b, <8 x i64 ; CHECK-NEXT: [[C_BC:%.*]] = bitcast <8 x i64> [[C:%.*]] to <16 x i32> ; CHECK-NEXT: [[D_BC:%.*]] = bitcast <8 x i64> [[D:%.*]] to <16 x i32> ; CHECK-NEXT: [[CMP:%.*]] = icmp slt <16 x i32> [[C_BC]], [[D_BC]] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i1> [[CMP]], <16 x i1> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[A:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[B:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> [[TMP5]], <8 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i1> [[CMP]], <16 x i1> poison, <8 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[A]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[TMP8]], <16 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[B]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i32> [[TMP11]], <8 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP12]], <16 x i32> -; CHECK-NEXT: [[RES:%.*]] = bitcast <16 x i32> [[TMP13]] to <8 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i64> [[A:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[B:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[CMP]], <16 x i32> [[TMP2]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[RES:%.*]] = bitcast <16 x i32> [[TMP3]] to <8 x i64> ; CHECK-NEXT: ret <8 x i64> [[RES]] ; %a.bc = bitcast <8 x i64> %a to <64 x i8> @@ -317,20 +287,10 @@ define <8 x i64> @x86_pblendvb_v32i16_v16i16(<8 x i64> %a, <8 x i64> %b, <8 x i6 ; CHECK-NEXT: [[C_BC:%.*]] = bitcast <8 x i64> [[C:%.*]] to <32 x i16> ; CHECK-NEXT: [[D_BC:%.*]] = bitcast <8 x i64> [[D:%.*]] to <32 x i16> ; CHECK-NEXT: [[CMP:%.*]] = icmp slt <32 x i16> [[C_BC]], [[D_BC]] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i1> [[CMP]], <32 x i1> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[A:%.*]] to <32 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[B:%.*]] to <32 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP1]], <16 x i16> [[TMP5]], <16 x i16> [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i1> [[CMP]], <32 x i1> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[A]] to <32 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <32 x i16> [[TMP8]], <32 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[B]] to <32 x i16> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <32 x i16> [[TMP10]], <32 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i16> [[TMP11]], <16 x i16> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i16> [[TMP6]], <16 x i16> [[TMP12]], <32 x i32> -; CHECK-NEXT: [[RES:%.*]] = bitcast <32 x i16> [[TMP13]] to <8 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i64> [[A:%.*]] to <32 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[B:%.*]] to <32 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[CMP]], <32 x i16> [[TMP2]], <32 x i16> [[TMP1]] +; CHECK-NEXT: [[RES:%.*]] = bitcast <32 x i16> [[TMP3]] to <8 x i64> ; CHECK-NEXT: ret <8 x i64> [[RES]] ; %a.bc = bitcast <8 x i64> %a to <64 x i8> diff --git a/llvm/test/Transforms/PhaseOrdering/pr98799-inline-simplifycfg-ub.ll b/llvm/test/Transforms/PhaseOrdering/pr98799-inline-simplifycfg-ub.ll new file mode 100644 index 00000000000000..17073fa1982025 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/pr98799-inline-simplifycfg-ub.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=inline,simplifycfg -S | FileCheck --check-prefix=CUSTOM %s +; RUN: opt < %s -O2 -S | FileCheck --check-prefix=O2 %s + +define internal ptr @bar(ptr %arg, i1 %arg1) { +bb: + br i1 %arg1, label %bb4, label %bb2 + +bb2: + %i = load ptr, ptr %arg, align 8 + %i3 = getelementptr inbounds i8, ptr %i, i64 1 + store ptr %i3, ptr %arg, align 8 + br label %bb4 + +bb4: + %i5 = phi ptr [ %i, %bb2 ], [ null, %bb ] + ret ptr %i5 +} + +define i32 @foo(ptr %arg, i1 %arg1) { +; CUSTOM-LABEL: define i32 @foo( +; CUSTOM-SAME: ptr [[ARG:%.*]], i1 [[ARG1:%.*]]) { +; CUSTOM-NEXT: [[BB:.*:]] +; CUSTOM-NEXT: [[TMP0:%.*]] = xor i1 [[ARG1]], true +; CUSTOM-NEXT: call void @llvm.assume(i1 [[TMP0]]) +; CUSTOM-NEXT: [[I_I:%.*]] = load ptr, ptr [[ARG]], align 8 +; CUSTOM-NEXT: [[I3_I:%.*]] = getelementptr inbounds i8, ptr [[I_I]], i64 1 +; CUSTOM-NEXT: store ptr [[I3_I]], ptr [[ARG]], align 8 +; CUSTOM-NEXT: [[I2:%.*]] = icmp ne ptr [[I_I]], null +; CUSTOM-NEXT: call void @llvm.assume(i1 [[I2]]) +; CUSTOM-NEXT: [[I3:%.*]] = load i32, ptr [[I_I]], align 4 +; CUSTOM-NEXT: ret i32 [[I3]] +; +; O2-LABEL: define i32 @foo( +; O2-SAME: ptr nocapture [[ARG:%.*]], i1 [[ARG1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; O2-NEXT: [[BB:.*:]] +; O2-NEXT: [[TMP0:%.*]] = xor i1 [[ARG1]], true +; O2-NEXT: tail call void @llvm.assume(i1 [[TMP0]]) +; O2-NEXT: [[I_I:%.*]] = load ptr, ptr [[ARG]], align 8, !nonnull [[META0:![0-9]+]], !noundef [[META0]] +; O2-NEXT: [[I3_I:%.*]] = getelementptr inbounds i8, ptr [[I_I]], i64 1 +; O2-NEXT: store ptr [[I3_I]], ptr [[ARG]], align 8 +; O2-NEXT: [[I3:%.*]] = load i32, ptr [[I_I]], align 4 +; O2-NEXT: ret i32 [[I3]] +; +bb: + %i = call ptr @bar(ptr %arg, i1 %arg1) + %i2 = icmp ne ptr %i, null + call void @llvm.assume(i1 %i2) + %i3 = load i32, ptr %i, align 4 + ret i32 %i3 +} + +declare void @llvm.assume(i1) +;. +; O2: [[META0]] = !{} +;. diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memcpy-inline-non-constant-len.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memcpy-inline-non-constant-len.ll new file mode 100644 index 00000000000000..a4e049941030ef --- /dev/null +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memcpy-inline-non-constant-len.ll @@ -0,0 +1,49 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=x86_64-pc-linux-gnu -passes=pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s + +; Constant length memcpy.inline should be left unmodified. +define void @memcpy_32(ptr %dst, ptr %src) nounwind { +; CHECK-LABEL: define void @memcpy_32( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void @llvm.memcpy.inline.p0.p0.i64(ptr [[DST]], ptr [[SRC]], i64 32, i1 false) +; CHECK-NEXT: tail call void @llvm.memcpy.inline.p0.p0.i64(ptr [[DST]], ptr [[SRC]], i64 32, i1 true) +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.inline.p0.p0.i64(ptr %dst, ptr %src, i64 32, i1 0) + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dst, ptr %src, i64 32, i1 1) + ret void +} + +define void @memcpy_x(ptr %dst, ptr %src, i64 %x) nounwind { +; CHECK-LABEL: define void @memcpy_x( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[X]], 0 +; CHECK-NEXT: br i1 [[TMP1]], label %[[LOOP_MEMCPY_EXPANSION:.*]], label %[[POST_LOOP_MEMCPY_EXPANSION:.*]] +; CHECK: [[LOOP_MEMCPY_EXPANSION]]: +; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[LOOP_MEMCPY_EXPANSION]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[LOOP_INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[LOOP_INDEX]] +; CHECK-NEXT: store i8 [[TMP3]], ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5]] = add i64 [[LOOP_INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[X]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[LOOP_MEMCPY_EXPANSION]], label %[[POST_LOOP_MEMCPY_EXPANSION]] +; CHECK: [[POST_LOOP_MEMCPY_EXPANSION]]: +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[X]], 0 +; CHECK-NEXT: br i1 [[TMP7]], label %[[LOOP_MEMCPY_EXPANSION2:.*]], label %[[POST_LOOP_MEMCPY_EXPANSION1:.*]] +; CHECK: [[LOOP_MEMCPY_EXPANSION2]]: +; CHECK-NEXT: [[LOOP_INDEX3:%.*]] = phi i64 [ 0, %[[POST_LOOP_MEMCPY_EXPANSION]] ], [ [[TMP11:%.*]], %[[LOOP_MEMCPY_EXPANSION2]] ] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[LOOP_INDEX3]] +; CHECK-NEXT: [[TMP9:%.*]] = load volatile i8, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[LOOP_INDEX3]] +; CHECK-NEXT: store volatile i8 [[TMP9]], ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11]] = add i64 [[LOOP_INDEX3]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = icmp ult i64 [[TMP11]], [[X]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[LOOP_MEMCPY_EXPANSION2]], label %[[POST_LOOP_MEMCPY_EXPANSION1]] +; CHECK: [[POST_LOOP_MEMCPY_EXPANSION1]]: +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.inline.p0.p0.i64(ptr %dst, ptr %src, i64 %x, i1 0) + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dst, ptr %src, i64 %x, i1 1) + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll index 283cc07dfb9b96..e60e356e5cd819 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll @@ -75,48 +75,64 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef ; CHECK-NEXT: [[TMP4TT_0_LCSSA:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_END_LOOPEXIT]] ] ; CHECK-NEXT: [[PB_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PB]], [[ENTRY]] ], [ [[SCEVGEP311]], [[WHILE_END_LOOPEXIT]] ] ; CHECK-NEXT: [[PA_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PA]], [[ENTRY]] ], [ [[SCEVGEP]], [[WHILE_END_LOOPEXIT]] ] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <4 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = trunc <4 x i64> [[TMP12]] to <4 x i32> -; CHECK-NEXT: [[TMP57:%.*]] = trunc <4 x i64> [[TMP15]] to <4 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = add <4 x i32> [[TMP16]], [[TMP57]] +; CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP4TT_0_LCSSA]], i64 0 +; CHECK-NEXT: [[VGETQ_LANE45:%.*]] = extractelement <2 x i64> [[TMP4TT_0_LCSSA]], i64 1 +; CHECK-NEXT: [[ADD:%.*]] = add i64 [[VGETQ_LANE]], [[VGETQ_LANE45]] +; CHECK-NEXT: [[CONV48:%.*]] = trunc i64 [[ADD]] to i32 +; CHECK-NEXT: [[VGETQ_LANE51:%.*]] = extractelement <2 x i64> [[TMP4FF_0_LCSSA]], i64 0 +; CHECK-NEXT: [[VGETQ_LANE55:%.*]] = extractelement <2 x i64> [[TMP4FF_0_LCSSA]], i64 1 +; CHECK-NEXT: [[ADD57:%.*]] = add i64 [[VGETQ_LANE51]], [[VGETQ_LANE55]] +; CHECK-NEXT: [[CONV60:%.*]] = trunc i64 [[ADD57]] to i32 +; CHECK-NEXT: [[VGETQ_LANE63:%.*]] = extractelement <2 x i64> [[TMP4TF_0_LCSSA]], i64 0 +; CHECK-NEXT: [[VGETQ_LANE67:%.*]] = extractelement <2 x i64> [[TMP4TF_0_LCSSA]], i64 1 +; CHECK-NEXT: [[ADD69:%.*]] = add i64 [[VGETQ_LANE63]], [[VGETQ_LANE67]] +; CHECK-NEXT: [[CONV72:%.*]] = trunc i64 [[ADD69]] to i32 +; CHECK-NEXT: [[VGETQ_LANE75:%.*]] = extractelement <2 x i64> [[TMP4FT_0_LCSSA]], i64 0 +; CHECK-NEXT: [[VGETQ_LANE79:%.*]] = extractelement <2 x i64> [[TMP4FT_0_LCSSA]], i64 1 +; CHECK-NEXT: [[ADD81:%.*]] = add i64 [[VGETQ_LANE75]], [[VGETQ_LANE79]] +; CHECK-NEXT: [[CONV84:%.*]] = trunc i64 [[ADD81]] to i32 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[NUMBEROFBOOLS]], 127 ; CHECK-NEXT: [[CMP86284:%.*]] = icmp ugt i32 [[AND]], 31 ; CHECK-NEXT: br i1 [[CMP86284]], label [[WHILE_BODY88:%.*]], label [[WHILE_END122:%.*]] ; CHECK: while.body88: ; CHECK-NEXT: [[PA_ADDR_1291:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_END121:%.*]] ], [ [[PA_ADDR_0_LCSSA]], [[WHILE_END]] ] ; CHECK-NEXT: [[PB_ADDR_1290:%.*]] = phi ptr [ [[INCDEC_PTR89:%.*]], [[WHILE_END121]] ], [ [[PB_ADDR_0_LCSSA]], [[WHILE_END]] ] +; CHECK-NEXT: [[_CTT_0289:%.*]] = phi i32 [ [[ADD99:%.*]], [[WHILE_END121]] ], [ [[CONV48]], [[WHILE_END]] ] +; CHECK-NEXT: [[_CFF_0288:%.*]] = phi i32 [ [[ADD106:%.*]], [[WHILE_END121]] ], [ [[CONV60]], [[WHILE_END]] ] +; CHECK-NEXT: [[_CTF_0287:%.*]] = phi i32 [ [[ADD113:%.*]], [[WHILE_END121]] ], [ [[CONV72]], [[WHILE_END]] ] +; CHECK-NEXT: [[_CFT_0286:%.*]] = phi i32 [ [[ADD120:%.*]], [[WHILE_END121]] ], [ [[CONV84]], [[WHILE_END]] ] ; CHECK-NEXT: [[NBBOOLBLOCK_1285:%.*]] = phi i32 [ [[SUB:%.*]], [[WHILE_END121]] ], [ [[AND]], [[WHILE_END]] ] -; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP34:%.*]], [[WHILE_END121]] ], [ [[TMP17]], [[WHILE_END]] ] -; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[PA_ADDR_1291]], align 4 -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[PB_ADDR_1290]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[PA_ADDR_1291]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[PB_ADDR_1290]], align 4 ; CHECK-NEXT: br label [[WHILE_BODY93:%.*]] ; CHECK: while.body93: -; CHECK-NEXT: [[A_0279:%.*]] = phi i32 [ [[TMP19]], [[WHILE_BODY88]] ], [ [[SHR96:%.*]], [[WHILE_BODY93]] ] -; CHECK-NEXT: [[B_0278:%.*]] = phi i32 [ [[TMP20]], [[WHILE_BODY88]] ], [ [[SHR97:%.*]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[_CTT_1283:%.*]] = phi i32 [ [[_CTT_0289]], [[WHILE_BODY88]] ], [ [[ADD99]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[_CFF_1282:%.*]] = phi i32 [ [[_CFF_0288]], [[WHILE_BODY88]] ], [ [[ADD106]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[_CTF_1281:%.*]] = phi i32 [ [[_CTF_0287]], [[WHILE_BODY88]] ], [ [[ADD113]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[_CFT_1280:%.*]] = phi i32 [ [[_CFT_0286]], [[WHILE_BODY88]] ], [ [[ADD120]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[A_0279:%.*]] = phi i32 [ [[TMP10]], [[WHILE_BODY88]] ], [ [[SHR96:%.*]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[B_0278:%.*]] = phi i32 [ [[TMP11]], [[WHILE_BODY88]] ], [ [[SHR97:%.*]], [[WHILE_BODY93]] ] ; CHECK-NEXT: [[SHIFT_0277:%.*]] = phi i32 [ 0, [[WHILE_BODY88]] ], [ [[INC:%.*]], [[WHILE_BODY93]] ] -; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x i32> [ [[TMP18]], [[WHILE_BODY88]] ], [ [[TMP34]], [[WHILE_BODY93]] ] ; CHECK-NEXT: [[AND94:%.*]] = and i32 [[A_0279]], 1 ; CHECK-NEXT: [[AND95:%.*]] = and i32 [[B_0278]], 1 ; CHECK-NEXT: [[SHR96]] = lshr i32 [[A_0279]], 1 ; CHECK-NEXT: [[SHR97]] = lshr i32 [[B_0278]], 1 ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[AND94]], 0 ; CHECK-NEXT: [[TOBOOL98:%.*]] = icmp ne i32 [[AND95]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TOBOOL]], i1 [[TOBOOL98]], i1 false +; CHECK-NEXT: [[LAND_EXT:%.*]] = zext i1 [[TMP12]] to i32 +; CHECK-NEXT: [[ADD99]] = add i32 [[_CTT_1283]], [[LAND_EXT]] ; CHECK-NEXT: [[TOBOOL100:%.*]] = icmp eq i32 [[AND94]], 0 ; CHECK-NEXT: [[TOBOOL103:%.*]] = icmp eq i32 [[AND95]], 0 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i1> poison, i1 [[TOBOOL100]], i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x i1> [[TMP22]], i1 [[TOBOOL]], i32 1 -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i1> [[TMP23]], <4 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i1> poison, i1 [[TOBOOL98]], i32 0 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i1> [[TMP25]], i1 [[TOBOOL103]], i32 1 -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i1> [[TMP27]], <4 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP31]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP33:%.*]] = zext <4 x i1> [[TMP32]] to <4 x i32> -; CHECK-NEXT: [[TMP34]] = add <4 x i32> [[TMP21]], [[TMP33]] +; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TOBOOL100]], i1 [[TOBOOL103]], i1 false +; CHECK-NEXT: [[LAND_EXT105:%.*]] = zext i1 [[TMP13]] to i32 +; CHECK-NEXT: [[ADD106]] = add i32 [[_CFF_1282]], [[LAND_EXT105]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TOBOOL]], i1 [[TOBOOL103]], i1 false +; CHECK-NEXT: [[LAND_EXT112:%.*]] = zext i1 [[TMP14]] to i32 +; CHECK-NEXT: [[ADD113]] = add i32 [[_CTF_1281]], [[LAND_EXT112]] +; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TOBOOL100]], i1 [[TOBOOL98]], i1 false +; CHECK-NEXT: [[LAND_EXT119:%.*]] = zext i1 [[TMP15]] to i32 +; CHECK-NEXT: [[ADD120]] = add i32 [[_CFT_1280]], [[LAND_EXT119]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[SHIFT_0277]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], 32 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[WHILE_END121]], label [[WHILE_BODY93]] @@ -128,53 +144,61 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef ; CHECK-NEXT: br i1 [[CMP86]], label [[WHILE_BODY88]], label [[WHILE_END122]] ; CHECK: while.end122: ; CHECK-NEXT: [[NBBOOLBLOCK_1_LCSSA:%.*]] = phi i32 [ [[AND]], [[WHILE_END]] ], [ [[SUB]], [[WHILE_END121]] ] +; CHECK-NEXT: [[_CFT_0_LCSSA:%.*]] = phi i32 [ [[CONV84]], [[WHILE_END]] ], [ [[ADD120]], [[WHILE_END121]] ] +; CHECK-NEXT: [[_CTF_0_LCSSA:%.*]] = phi i32 [ [[CONV72]], [[WHILE_END]] ], [ [[ADD113]], [[WHILE_END121]] ] +; CHECK-NEXT: [[_CFF_0_LCSSA:%.*]] = phi i32 [ [[CONV60]], [[WHILE_END]] ], [ [[ADD106]], [[WHILE_END121]] ] +; CHECK-NEXT: [[_CTT_0_LCSSA:%.*]] = phi i32 [ [[CONV48]], [[WHILE_END]] ], [ [[ADD99]], [[WHILE_END121]] ] ; CHECK-NEXT: [[PB_ADDR_1_LCSSA:%.*]] = phi ptr [ [[PB_ADDR_0_LCSSA]], [[WHILE_END]] ], [ [[INCDEC_PTR89]], [[WHILE_END121]] ] ; CHECK-NEXT: [[PA_ADDR_1_LCSSA:%.*]] = phi ptr [ [[PA_ADDR_0_LCSSA]], [[WHILE_END]] ], [ [[INCDEC_PTR]], [[WHILE_END121]] ] -; CHECK-NEXT: [[TMP35:%.*]] = phi <4 x i32> [ [[TMP17]], [[WHILE_END]] ], [ [[TMP34]], [[WHILE_END121]] ] ; CHECK-NEXT: [[CMP130_NOT299:%.*]] = icmp eq i32 [[NBBOOLBLOCK_1_LCSSA]], 0 ; CHECK-NEXT: br i1 [[CMP130_NOT299]], label [[WHILE_END166:%.*]], label [[WHILE_BODY132_PREHEADER:%.*]] ; CHECK: while.body132.preheader: -; CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[PB_ADDR_1_LCSSA]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[PB_ADDR_1_LCSSA]], align 4 ; CHECK-NEXT: [[SUB125:%.*]] = sub nuw nsw i32 32, [[NBBOOLBLOCK_1_LCSSA]] -; CHECK-NEXT: [[SHR128:%.*]] = lshr i32 [[TMP36]], [[SUB125]] -; CHECK-NEXT: [[TMP37:%.*]] = load i32, ptr [[PA_ADDR_1_LCSSA]], align 4 -; CHECK-NEXT: [[SHR126:%.*]] = lshr i32 [[TMP37]], [[SUB125]] +; CHECK-NEXT: [[SHR128:%.*]] = lshr i32 [[TMP16]], [[SUB125]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[PA_ADDR_1_LCSSA]], align 4 +; CHECK-NEXT: [[SHR126:%.*]] = lshr i32 [[TMP17]], [[SUB125]] ; CHECK-NEXT: br label [[WHILE_BODY132:%.*]] ; CHECK: while.body132: +; CHECK-NEXT: [[_CTT_2306:%.*]] = phi i32 [ [[ADD142:%.*]], [[WHILE_BODY132]] ], [ [[_CTT_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] +; CHECK-NEXT: [[_CFF_2305:%.*]] = phi i32 [ [[ADD150:%.*]], [[WHILE_BODY132]] ], [ [[_CFF_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] +; CHECK-NEXT: [[_CTF_2304:%.*]] = phi i32 [ [[ADD157:%.*]], [[WHILE_BODY132]] ], [ [[_CTF_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] +; CHECK-NEXT: [[_CFT_2303:%.*]] = phi i32 [ [[ADD164:%.*]], [[WHILE_BODY132]] ], [ [[_CFT_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[NBBOOLBLOCK_2302:%.*]] = phi i32 [ [[DEC165:%.*]], [[WHILE_BODY132]] ], [ [[NBBOOLBLOCK_1_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[A_1301:%.*]] = phi i32 [ [[SHR135:%.*]], [[WHILE_BODY132]] ], [ [[SHR126]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[B_1300:%.*]] = phi i32 [ [[SHR136:%.*]], [[WHILE_BODY132]] ], [ [[SHR128]], [[WHILE_BODY132_PREHEADER]] ] -; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP51:%.*]], [[WHILE_BODY132]] ], [ [[TMP35]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[AND133:%.*]] = and i32 [[A_1301]], 1 ; CHECK-NEXT: [[AND134:%.*]] = and i32 [[B_1300]], 1 ; CHECK-NEXT: [[SHR135]] = lshr i32 [[A_1301]], 1 ; CHECK-NEXT: [[SHR136]] = lshr i32 [[B_1300]], 1 ; CHECK-NEXT: [[TOBOOL137:%.*]] = icmp ne i32 [[AND133]], 0 ; CHECK-NEXT: [[TOBOOL139:%.*]] = icmp ne i32 [[AND134]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TOBOOL137]], i1 [[TOBOOL139]], i1 false +; CHECK-NEXT: [[LAND_EXT141:%.*]] = zext i1 [[TMP18]] to i32 +; CHECK-NEXT: [[ADD142]] = add i32 [[_CTT_2306]], [[LAND_EXT141]] ; CHECK-NEXT: [[TOBOOL144:%.*]] = icmp eq i32 [[AND133]], 0 ; CHECK-NEXT: [[TOBOOL147:%.*]] = icmp eq i32 [[AND134]], 0 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i1> poison, i1 [[TOBOOL144]], i32 0 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i1> [[TMP40]], i1 [[TOBOOL137]], i32 1 -; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <4 x i1> [[TMP41]], <4 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x i1> poison, i1 [[TOBOOL139]], i32 0 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP42]], i1 [[TOBOOL147]], i32 1 -; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <4 x i1> [[TMP39]], <4 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP43]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP50:%.*]] = zext <4 x i1> [[TMP49]] to <4 x i32> -; CHECK-NEXT: [[TMP51]] = add <4 x i32> [[TMP38]], [[TMP50]] +; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TOBOOL144]], i1 [[TOBOOL147]], i1 false +; CHECK-NEXT: [[LAND_EXT149:%.*]] = zext i1 [[TMP19]] to i32 +; CHECK-NEXT: [[ADD150]] = add i32 [[_CFF_2305]], [[LAND_EXT149]] +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TOBOOL137]], i1 [[TOBOOL147]], i1 false +; CHECK-NEXT: [[LAND_EXT156:%.*]] = zext i1 [[TMP20]] to i32 +; CHECK-NEXT: [[ADD157]] = add i32 [[_CTF_2304]], [[LAND_EXT156]] +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TOBOOL144]], i1 [[TOBOOL139]], i1 false +; CHECK-NEXT: [[LAND_EXT163:%.*]] = zext i1 [[TMP21]] to i32 +; CHECK-NEXT: [[ADD164]] = add i32 [[_CFT_2303]], [[LAND_EXT163]] ; CHECK-NEXT: [[DEC165]] = add nsw i32 [[NBBOOLBLOCK_2302]], -1 ; CHECK-NEXT: [[CMP130_NOT:%.*]] = icmp eq i32 [[DEC165]], 0 ; CHECK-NEXT: br i1 [[CMP130_NOT]], label [[WHILE_END166]], label [[WHILE_BODY132]] ; CHECK: while.end166: -; CHECK-NEXT: [[TMP52:%.*]] = phi <4 x i32> [ [[TMP35]], [[WHILE_END122]] ], [ [[TMP51]], [[WHILE_BODY132]] ] -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP52]], i32 3 -; CHECK-NEXT: store i32 [[TMP53]], ptr [[CTT:%.*]], align 4 -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP52]], i32 2 -; CHECK-NEXT: store i32 [[TMP54]], ptr [[CFF:%.*]], align 4 -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP52]], i32 1 -; CHECK-NEXT: store i32 [[TMP55]], ptr [[CTF:%.*]], align 4 -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x i32> [[TMP52]], i32 0 -; CHECK-NEXT: store i32 [[TMP56]], ptr [[CFT:%.*]], align 4 +; CHECK-NEXT: [[_CFT_2_LCSSA:%.*]] = phi i32 [ [[_CFT_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD164]], [[WHILE_BODY132]] ] +; CHECK-NEXT: [[_CTF_2_LCSSA:%.*]] = phi i32 [ [[_CTF_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD157]], [[WHILE_BODY132]] ] +; CHECK-NEXT: [[_CFF_2_LCSSA:%.*]] = phi i32 [ [[_CFF_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD150]], [[WHILE_BODY132]] ] +; CHECK-NEXT: [[_CTT_2_LCSSA:%.*]] = phi i32 [ [[_CTT_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD142]], [[WHILE_BODY132]] ] +; CHECK-NEXT: store i32 [[_CTT_2_LCSSA]], ptr [[CTT:%.*]], align 4 +; CHECK-NEXT: store i32 [[_CFF_2_LCSSA]], ptr [[CFF:%.*]], align 4 +; CHECK-NEXT: store i32 [[_CTF_2_LCSSA]], ptr [[CTF:%.*]], align 4 +; CHECK-NEXT: store i32 [[_CFT_2_LCSSA]], ptr [[CFT:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll index a24cb81541d7c1..c6209fd71063a0 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll @@ -10,33 +10,32 @@ define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> , [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> , float [[ARG3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x float> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[TMP7]], +; CHECK-NEXT: [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP2]], float [[VAL12]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float 0.000000e+00, i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[TMP5]], ; CHECK-NEXT: br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]] ; CHECK: bb18: -; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x float> [ [[TMP8]], [[BB:%.*]] ] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP8]], i32 2 -; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[TMP10]], 2.000000e+00 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP8]], i32 3 -; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[TMP11]], 3.000000e+00 +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP6]], [[BB:%.*]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP6]], i32 2 +; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[TMP8]], 2.000000e+00 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i32 3 +; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[TMP9]], 3.000000e+00 ; CHECK-NEXT: br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]] ; CHECK: bb25: -; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x float> [ [[TMP9]], [[BB18]] ] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x float> [ [[TMP7]], [[BB18]] ] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 ; CHECK-NEXT: br label [[BB30:%.*]] ; CHECK: bb30: ; CHECK-NEXT: [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] -; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[TMP13]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] -; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = uitofp <4 x i8> [[TMP14]] to <4 x float> -; CHECK-NEXT: [[TMP16:%.*]] = fsub fast <4 x float> [[TMP15]], [[TMP3]] -; CHECK-NEXT: [[TMP17:%.*]] = fmul fast <4 x float> [[TMP16]], [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP17]]) +; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[TMP11]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = uitofp <4 x i8> [[TMP12]] to <4 x float> +; CHECK-NEXT: [[TMP14:%.*]] = fsub fast <4 x float> [[TMP13]], [[TMP3]] +; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <4 x float> [[TMP14]], [[TMP10]] +; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP15]]) ; CHECK-NEXT: [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]]) -; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP18]]) +; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP16]]) ; CHECK-NEXT: call void @ham(float [[VAL55]], float [[VAL56]]) ; CHECK-NEXT: br i1 [[ARG8:%.*]], label [[BB30]], label [[BB57]] ; CHECK: bb57: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll index a009841de6e65f..644d645b9dc88d 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll @@ -8,7 +8,7 @@ ; YAML-NEXT: Function: min_double ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-3' +; YAML-NEXT: - Cost: '-1' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '6' define i32 @min_double(ptr noalias nocapture %A, ptr noalias nocapture %B) { @@ -76,7 +76,7 @@ entry: ; YAML-NEXT: Function: max_double ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-3' +; YAML-NEXT: - Cost: '-1' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '6' define i32 @max_double(ptr noalias nocapture %A, ptr noalias nocapture %B) { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll index 40ca0150d8e744..0b9ed47ce0f178 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll @@ -13,28 +13,30 @@ define void @get_block(i32 %y_pos) local_unnamed_addr #0 { ; CHECK: if.end: ; CHECK-NEXT: [[SUB14:%.*]] = sub nsw i32 [[Y_POS:%.*]], undef ; CHECK-NEXT: [[SHR15:%.*]] = ashr i32 [[SUB14]], 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[SHR15]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[SUB14]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP2]], i32 undef, i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 undef, i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 undef, i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[TMP7]], undef -; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP7]], <4 x i32> undef -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 -; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_1:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP9]], i32 2 -; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_2:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_3:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[TMP17]] +; CHECK-NEXT: [[CMP_I_I:%.*]] = icmp sgt i32 [[SHR15]], 0 +; CHECK-NEXT: [[COND_I_I:%.*]] = select i1 [[CMP_I_I]], i32 [[SHR15]], i32 0 +; CHECK-NEXT: [[CMP_I4_I:%.*]] = icmp slt i32 [[COND_I_I]], undef +; CHECK-NEXT: [[COND_I5_I:%.*]] = select i1 [[CMP_I4_I]], i32 [[COND_I_I]], i32 undef +; CHECK-NEXT: [[IDXPROM30:%.*]] = sext i32 [[COND_I5_I]] to i64 +; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[IDXPROM30]] +; CHECK-NEXT: [[CMP_I_I_1:%.*]] = icmp sgt i32 [[SUB14]], -1 +; CHECK-NEXT: [[COND_I_I_1:%.*]] = select i1 [[CMP_I_I_1]], i32 undef, i32 0 +; CHECK-NEXT: [[CMP_I4_I_1:%.*]] = icmp slt i32 [[COND_I_I_1]], undef +; CHECK-NEXT: [[COND_I5_I_1:%.*]] = select i1 [[CMP_I4_I_1]], i32 [[COND_I_I_1]], i32 undef +; CHECK-NEXT: [[IDXPROM30_1:%.*]] = sext i32 [[COND_I5_I_1]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_1:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[IDXPROM30_1]] +; CHECK-NEXT: [[CMP_I_I_2:%.*]] = icmp sgt i32 [[SUB14]], -5 +; CHECK-NEXT: [[COND_I_I_2:%.*]] = select i1 [[CMP_I_I_2]], i32 undef, i32 0 +; CHECK-NEXT: [[CMP_I4_I_2:%.*]] = icmp slt i32 [[COND_I_I_2]], undef +; CHECK-NEXT: [[COND_I5_I_2:%.*]] = select i1 [[CMP_I4_I_2]], i32 [[COND_I_I_2]], i32 undef +; CHECK-NEXT: [[IDXPROM30_2:%.*]] = sext i32 [[COND_I5_I_2]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_2:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[IDXPROM30_2]] +; CHECK-NEXT: [[CMP_I_I_3:%.*]] = icmp sgt i32 [[SUB14]], -9 +; CHECK-NEXT: [[COND_I_I_3:%.*]] = select i1 [[CMP_I_I_3]], i32 undef, i32 0 +; CHECK-NEXT: [[CMP_I4_I_3:%.*]] = icmp slt i32 [[COND_I_I_3]], undef +; CHECK-NEXT: [[COND_I5_I_3:%.*]] = select i1 [[CMP_I4_I_3]], i32 [[COND_I_I_3]], i32 undef +; CHECK-NEXT: [[IDXPROM30_3:%.*]] = sext i32 [[COND_I5_I_3]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_3:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[IDXPROM30_3]] ; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll index bf4903fd19b09b..2f5936357d0f5e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll @@ -7,8 +7,7 @@ ; YAML-NEXT: Name: NotPossible ; YAML-NEXT: Function: g ; YAML-NEXT: Args: -; YAML-NEXT: - String: 'Cannot SLP vectorize list: vectorization was impossible' -; YAML-NEXT: - String: ' with available vectorization factors' +; YAML-NEXT: - String: 'Cannot SLP vectorize list: only 2 elements of buildvector, trying reduction first.' define <2 x i32> @g(<2 x i32> %x, i32 %a, i32 %b) { ; CHECK-LABEL: @g( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/select-reduction-op.ll b/llvm/test/Transforms/SLPVectorizer/X86/select-reduction-op.ll new file mode 100644 index 00000000000000..5f62def150d8f7 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/select-reduction-op.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux < %s | FileCheck %s + +define i1 @src(i1 %cmp4.118.i) { +; CHECK-LABEL: define i1 @src( +; CHECK-SAME: i1 [[CMP4_118_I:%.*]]) { +; CHECK-NEXT: [[CMP4_118_I_NOT:%.*]] = xor i1 [[CMP4_118_I]], true +; CHECK-NEXT: [[TMP1:%.*]] = freeze <4 x i1> poison +; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[CMP4_118_I_NOT]], i1 true, i1 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze i1 [[OP_RDX]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[TMP3]], i1 true, i1 poison +; CHECK-NEXT: ret i1 [[OP_RDX1]] +; + %cmp4.118.i.not = xor i1 %cmp4.118.i, true + %brmerge = select i1 %cmp4.118.i.not, i1 true, i1 poison + %.not = xor i1 poison, true + %brmerge2 = select i1 %brmerge, i1 true, i1 %.not + %.not3 = xor i1 poison, true + %brmerge4 = select i1 %brmerge2, i1 true, i1 %.not3 + %.not5 = xor i1 poison, true + %brmerge6 = select i1 %brmerge4, i1 true, i1 %.not5 + %.not7 = xor i1 poison, true + %brmerge8 = select i1 %brmerge6, i1 true, i1 %.not7 + ret i1 %brmerge8 +} diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/preserving-dropping-debugloc-nontrivial.ll b/llvm/test/Transforms/SimpleLoopUnswitch/preserving-dropping-debugloc-nontrivial.ll new file mode 100644 index 00000000000000..bdb97ff2206085 --- /dev/null +++ b/llvm/test/Transforms/SimpleLoopUnswitch/preserving-dropping-debugloc-nontrivial.ll @@ -0,0 +1,75 @@ +; RUN: opt -passes='simple-loop-unswitch' -S < %s | FileCheck %s + +define i32 @basic(i32 %N, i1 %cond, i32 %select_input) !dbg !5 { +; CHECK-LABEL: define i32 @basic( + +; Check that SimpleLoopUnswitch's unswitchNontrivialInvariants() drops the +; debug location of the hoisted terminator and doesn't give any debug location +; to the new freeze, since it's inserted in a hoist block. +; Also check that in unswitchNontrivialInvariants(), the new br instruction +; inherits the debug location of the old terminator in the same block. + +; CHECK: entry: +; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[COND:%.*]]{{$}} +; CHECK-NEXT: br i1 [[COND_FR]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]]{{$}} +; CHECK: for.body.us: +; CHECK-NEXT: br label %0, !dbg [[DBG13:![0-9]+]] + +; Check that in turnSelectIntoBranch(), the new phi inherits the debug +; location of the old select instruction replaced. + +; CHECK: 1: +; CHECK-NEXT: [[UNSWITCHED_SELECT_US:%.*]] = phi i32 [ [[SELECT_INPUT:%.*]], %0 ], !dbg [[DBG13]] + +; Check that in BuildClonedLoopBlocks(), the new phi inherits the debug +; location of the instruction at the insertion point and the new br +; instruction inherits the debug location of the old terminator. + +; CHECK: for.body: +; CHECK-NEXT: br label %2, !dbg [[DBG13]] +; CHECK: for.cond.cleanup: +; CHECK: [[DOTUS_PHI:%.*]] = phi i32 [ [[RES_LCSSA:%.*]], %[[FOR_COND_CLEANUP_SPLIT:.*]] ], [ [[RES_LCSSA_US:%.*]], %[[FOR_COND_CLEANUP_SPLIT_US:.*]] ], !dbg [[DBG17:![0-9]+]] +entry: + br label %for.cond, !dbg !8 + +for.cond: ; preds = %for.body, %entry + %res = phi i32 [ 0, %entry ], [ %add, %for.body ], !dbg !9 + %i = phi i32 [ 0, %entry ], [ %inc, %for.body ], !dbg !10 + %cmp = icmp slt i32 %i, %N, !dbg !11 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !12 + +for.body: ; preds = %for.cond + %cond1 = select i1 %cond, i32 %select_input, i32 42, !dbg !13 + %add = add nuw nsw i32 %cond1, %res, !dbg !14 + %inc = add nuw nsw i32 %i, 1, !dbg !15 + br label %for.cond, !dbg !16 + +for.cond.cleanup: ; preds = %for.cond + ret i32 %res, !dbg !17 +} + +!llvm.dbg.cu = !{!0} +!llvm.debugify = !{!2, !3} +!llvm.module.flags = !{!4} + +; CHECK: [[DBG13]] = !DILocation(line: 6, +; CHECK: [[DBG17]] = !DILocation(line: 10, + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "main.ll", directory: "/") +!2 = !{i32 10} +!3 = !{i32 0} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "basic", linkageName: "basic", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !DILocation(line: 1, column: 1, scope: !5) +!9 = !DILocation(line: 2, column: 1, scope: !5) +!10 = !DILocation(line: 3, column: 1, scope: !5) +!11 = !DILocation(line: 4, column: 1, scope: !5) +!12 = !DILocation(line: 5, column: 1, scope: !5) +!13 = !DILocation(line: 6, column: 1, scope: !5) +!14 = !DILocation(line: 7, column: 1, scope: !5) +!15 = !DILocation(line: 8, column: 1, scope: !5) +!16 = !DILocation(line: 9, column: 1, scope: !5) +!17 = !DILocation(line: 10, column: 1, scope: !5) diff --git a/llvm/test/Transforms/SimplifyCFG/UnreachableEliminate.ll b/llvm/test/Transforms/SimplifyCFG/UnreachableEliminate.ll index ef2d3219cca9b6..c4602e72ecbce0 100644 --- a/llvm/test/Transforms/SimplifyCFG/UnreachableEliminate.ll +++ b/llvm/test/Transforms/SimplifyCFG/UnreachableEliminate.ll @@ -20,7 +20,7 @@ F: define void @test2() personality ptr @__gxx_personality_v0 { ; CHECK-LABEL: @test2( ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @test2() #[[ATTR3:[0-9]+]] +; CHECK-NEXT: call void @test2() #[[ATTR4:[0-9]+]] ; CHECK-NEXT: ret void ; entry: @@ -242,6 +242,8 @@ declare ptr @fn_nonnull_deref_arg(ptr nonnull dereferenceable(4) %p) declare ptr @fn_nonnull_deref_or_null_arg(ptr nonnull dereferenceable_or_null(4) %p) declare ptr @fn_nonnull_arg(ptr nonnull %p) declare ptr @fn_noundef_arg(ptr noundef %p) +declare ptr @fn_ptr_arg(ptr) +declare ptr @fn_ptr_arg_nounwind_willreturn(ptr) nounwind willreturn define void @test9(i1 %X, ptr %Y) { ; CHECK-LABEL: @test9( @@ -855,10 +857,72 @@ exit: ret i32 %res } +; From bb to bb5 is UB. +define i32 @test9_null_user_order_1(ptr %arg, i1 %arg1, ptr %arg2) { +; CHECK-LABEL: @test9_null_user_order_1( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], true +; CHECK-NEXT: call void @llvm.assume(i1 [[TMP0]]) +; CHECK-NEXT: [[I:%.*]] = load ptr, ptr [[ARG:%.*]], align 8 +; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i8, ptr [[I]], i64 1 +; CHECK-NEXT: store ptr [[I4]], ptr [[ARG]], align 8 +; CHECK-NEXT: [[I7:%.*]] = load i32, ptr [[I]], align 4 +; CHECK-NEXT: [[I8:%.*]] = icmp ne ptr [[I]], [[ARG2:%.*]] +; CHECK-NEXT: call void @fn_ptr_arg(i1 [[I8]]) +; CHECK-NEXT: ret i32 [[I7]] +; +bb: + br i1 %arg1, label %bb5, label %bb3 + +bb3: ; preds = %bb + %i = load ptr, ptr %arg, align 8 + %i4 = getelementptr inbounds i8, ptr %i, i64 1 + store ptr %i4, ptr %arg, align 8 + br label %bb5 + +bb5: ; preds = %bb3, %bb + %i6 = phi ptr [ %i, %bb3 ], [ null, %bb ] + %i7 = load i32, ptr %i6, align 4 + %i8 = icmp ne ptr %i6, %arg2 + call void @fn_ptr_arg(i1 %i8) + ret i32 %i7 +} + +define i32 @test9_null_user_order_2(ptr %arg, i1 %arg1, ptr %arg2) { +; CHECK-LABEL: @test9_null_user_order_2( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], true +; CHECK-NEXT: call void @llvm.assume(i1 [[TMP0]]) +; CHECK-NEXT: [[I:%.*]] = load ptr, ptr [[ARG:%.*]], align 8 +; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i8, ptr [[I]], i64 1 +; CHECK-NEXT: store ptr [[I4]], ptr [[ARG]], align 8 +; CHECK-NEXT: [[I8:%.*]] = icmp ne ptr [[I]], [[ARG2:%.*]] +; CHECK-NEXT: call void @fn_ptr_arg_nounwind_willreturn(i1 [[I8]]) +; CHECK-NEXT: [[I7:%.*]] = load i32, ptr [[I]], align 4 +; CHECK-NEXT: ret i32 [[I7]] +; +bb: + br i1 %arg1, label %bb5, label %bb3 + +bb3: ; preds = %bb + %i = load ptr, ptr %arg, align 8 + %i4 = getelementptr inbounds i8, ptr %i, i64 1 + store ptr %i4, ptr %arg, align 8 + br label %bb5 + +bb5: ; preds = %bb3, %bb + %i6 = phi ptr [ %i, %bb3 ], [ null, %bb ] + %i8 = icmp ne ptr %i6, %arg2 + call void @fn_ptr_arg_nounwind_willreturn(i1 %i8) + %i7 = load i32, ptr %i6, align 4 + ret i32 %i7 +} + attributes #0 = { null_pointer_is_valid } ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { null_pointer_is_valid } -; CHECK: attributes #[[ATTR3]] = { nounwind } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind willreturn } +; CHECK: attributes #[[ATTR4]] = { nounwind } ;. diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll index a2c82ddd19480a..af04fb0ab4621b 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll @@ -993,6 +993,41 @@ define void @maximal_legal_fpmath(ptr %addr1, ptr %addr2, ptr %result, float %va ret void } +; Peek through (repeated) bitcasts to find a common source value. +define <4 x i64> @bitcast_smax_v8i32_v4i32(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: @bitcast_smax_v8i32_v4i32( +; CHECK-NEXT: [[A_BC0:%.*]] = bitcast <4 x i64> [[A:%.*]] to <8 x i32> +; CHECK-NEXT: [[B_BC0:%.*]] = bitcast <4 x i64> [[B:%.*]] to <8 x i32> +; CHECK-NEXT: [[CMP:%.*]] = icmp slt <8 x i32> [[A_BC0]], [[B_BC0]] +; CHECK-NEXT: [[A_BC1:%.*]] = bitcast <4 x i64> [[A]] to <8 x i32> +; CHECK-NEXT: [[B_BC1:%.*]] = bitcast <4 x i64> [[B]] to <8 x i32> +; CHECK-NEXT: [[CONCAT:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[B_BC1]], <8 x i32> [[A_BC1]] +; CHECK-NEXT: [[RES:%.*]] = bitcast <8 x i32> [[CONCAT]] to <4 x i64> +; CHECK-NEXT: ret <4 x i64> [[RES]] +; + %a.bc0 = bitcast <4 x i64> %a to <8 x i32> + %b.bc0 = bitcast <4 x i64> %b to <8 x i32> + %cmp = icmp slt <8 x i32> %a.bc0, %b.bc0 + %cmp.lo = shufflevector <8 x i1> %cmp, <8 x i1> poison, <4 x i32> + %cmp.hi = shufflevector <8 x i1> %cmp, <8 x i1> poison, <4 x i32> + + %a.bc1 = bitcast <4 x i64> %a to <8 x i32> + %b.bc1 = bitcast <4 x i64> %b to <8 x i32> + %a.lo = shufflevector <8 x i32> %a.bc1, <8 x i32> poison, <4 x i32> + %b.lo = shufflevector <8 x i32> %b.bc1, <8 x i32> poison, <4 x i32> + %lo = select <4 x i1> %cmp.lo, <4 x i32> %b.lo, <4 x i32> %a.lo + + %a.bc2 = bitcast <4 x i64> %a to <8 x i32> + %b.bc2 = bitcast <4 x i64> %b to <8 x i32> + %a.hi = shufflevector <8 x i32> %a.bc2, <8 x i32> poison, <4 x i32> + %b.hi = shufflevector <8 x i32> %b.bc2, <8 x i32> poison, <4 x i32> + %hi = select <4 x i1> %cmp.hi, <4 x i32> %b.hi, <4 x i32> %a.hi + + %concat = shufflevector <4 x i32> %lo, <4 x i32> %hi, <8 x i32> + %res = bitcast <8 x i32> %concat to <4 x i64> + ret <4 x i64> %res +} + define void @bitcast_srcty_mismatch() { ; CHECK-LABEL: @bitcast_srcty_mismatch( ; CHECK-NEXT: entry: diff --git a/llvm/test/Verifier/intrinsic-immarg.ll b/llvm/test/Verifier/intrinsic-immarg.ll index b1b9f7ee4be112..ad70b17e5fb727 100644 --- a/llvm/test/Verifier/intrinsic-immarg.ll +++ b/llvm/test/Verifier/intrinsic-immarg.ll @@ -36,14 +36,6 @@ define void @memcpy_inline_is_volatile(ptr %dest, ptr %src, i1 %is.volatile) { ret void } -define void @memcpy_inline_variable_size(ptr %dest, ptr %src, i32 %size) { - ; CHECK: immarg operand has non-immediate parameter - ; CHECK-NEXT: i32 %size - ; CHECK-NEXT: call void @llvm.memcpy.inline.p0.p0.i32(ptr %dest, ptr %src, i32 %size, i1 true) - call void @llvm.memcpy.inline.p0.p0.i32(ptr %dest, ptr %src, i32 %size, i1 true) - ret void -} - declare void @llvm.memmove.p0.p0.i32(ptr nocapture, ptr nocapture, i32, i1) define void @memmove(ptr %dest, ptr %src, i1 %is.volatile) { ; CHECK: immarg operand has non-immediate parameter diff --git a/llvm/test/tools/llvm-dlltool/coff-decorated.def b/llvm/test/tools/llvm-dlltool/coff-decorated.def index fc81f23d09d6c4..f5685fb1cf0c6a 100644 --- a/llvm/test/tools/llvm-dlltool/coff-decorated.def +++ b/llvm/test/tools/llvm-dlltool/coff-decorated.def @@ -7,12 +7,16 @@ EXPORTS CdeclFunction StdcallFunction@4 @FastcallFunction@4 -StdcallAlias@4==StdcallFunction@4 +StdcallAlias@4==StdcallFunction ??_7exception@@6B@ StdcallExportName@4=StdcallInternalFunction@4 OtherStdcallExportName@4=CdeclInternalFunction CdeclExportName=StdcallInternalFunction@4 +NoprefixStdcall@4 == NoprefixStdcall@4 +DecoratedStdcall@4 == _DecoratedStdcall@4 +UndecoratedStdcall@4 == UndecoratedStdcall + ; CHECK: Name type: noprefix ; CHECK-NEXT: Export name: CdeclFunction ; CHECK-NEXT: Symbol: __imp__CdeclFunction @@ -43,3 +47,15 @@ CdeclExportName=StdcallInternalFunction@4 ; CHECK-NEXT: Export name: CdeclExportName ; CHECK-NEXT: Symbol: __imp__CdeclExportName ; CHECK-NEXT: Symbol: _CdeclExportName +; CHECK: Name type: noprefix +; CHECK-NEXT: Export name: NoprefixStdcall@4 +; CHECK-NEXT: Symbol: __imp__NoprefixStdcall@4 +; CHECK-NEXT: Symbol: _NoprefixStdcall@4 +; CHECK: Name type: name +; CHECK-NEXT: Export name: _DecoratedStdcall@4 +; CHECK-NEXT: Symbol: __imp__DecoratedStdcall@4 +; CHECK-NEXT: Symbol: _DecoratedStdcall@4 +; CHECK: Name type: undecorate +; CHECK-NEXT: Export name: UndecoratedStdcall +; CHECK-NEXT: Symbol: __imp__UndecoratedStdcall@4 +; CHECK-NEXT: Symbol: _UndecoratedStdcall@4 diff --git a/llvm/test/tools/llvm-dlltool/coff-weak-exports.def b/llvm/test/tools/llvm-dlltool/coff-weak-exports.def index 67f0013bf170f8..b08040e29fa42e 100644 --- a/llvm/test/tools/llvm-dlltool/coff-weak-exports.def +++ b/llvm/test/tools/llvm-dlltool/coff-weak-exports.def @@ -5,6 +5,9 @@ LIBRARY test.dll EXPORTS +AltTestFunction +AltTestFunction2 +AltTestData TestFunction==AltTestFunction TestData DATA == AltTestData ; When creating an import library, the DLL internal function name of @@ -17,6 +20,14 @@ ImpLibName2 = Implementation2 == AltTestFunction2 ; matter for the import library ImpLibName3 = kernel32.Sleep +; CHECK: T AltTestFunction +; CHECK-NEXT: T __imp_AltTestFunction +; CHECK: T AltTestFunction2 +; CHECK-NEXT: T __imp_AltTestFunction2 +; CHECK: T ImpLibName +; CHECK-NEXT: T __imp_ImpLibName +; CHECK: T ImpLibName3 +; CHECK-NEXT: T __imp_ImpLibName3 ; CHECK: U AltTestFunction ; CHECK-NEXT: W TestFunction ; CHECK: U __imp_AltTestFunction @@ -24,14 +35,10 @@ ImpLibName3 = kernel32.Sleep ; CHECK-NOT: W TestData ; CHECK: U __imp_AltTestData ; CHECK-NEXT: W __imp_TestData -; CHECK: T ImpLibName -; CHECK-NEXT: T __imp_ImpLibName ; CHECK: U AltTestFunction2 ; CHECK-NEXT: W ImpLibName2 ; CHECK: U __imp_AltTestFunction2 ; CHECK-NEXT: W __imp_ImpLibName2 -; CHECK: T ImpLibName3 -; CHECK-NEXT: T __imp_ImpLibName3 ; ARCH-NOT: unknown arch diff --git a/llvm/test/tools/llvm-dlltool/renaming.def b/llvm/test/tools/llvm-dlltool/renaming.def new file mode 100644 index 00000000000000..57fd472aa37cf7 --- /dev/null +++ b/llvm/test/tools/llvm-dlltool/renaming.def @@ -0,0 +1,39 @@ +; RUN: llvm-dlltool -k -m i386 --input-def %s --output-lib %t.a +; RUN: llvm-readobj %t.a | FileCheck %s +; RUN: llvm-nm %t.a | FileCheck %s -check-prefix=CHECK-NM + +LIBRARY test.dll +EXPORTS + +symbolname == actualimport + +dataname DATA == actualdata + +_wcstok == wcstok +wcstok == wcstok_s + +; CHECK-NM-NOT: actualimport +; CHECK-NM-NOT: actualdata + +; CHECK: Type: code +; CHECK-NEXT: Name type: export as +; CHECK-NEXT: Export name: actualimport +; CHECK-NEXT: Symbol: __imp__symbolname +; CHECK-NEXT: Symbol: _symbolname + +; CHECK: Type: data +; CHECK-NEXT: Name type: export as +; CHECK-NEXT: Export name: actualdata +; CHECK-NEXT: Symbol: __imp__dataname + +; CHECK: Type: code +; CHECK-NEXT: Name type: export as +; CHECK-NEXT: Export name: wcstok +; CHECK-NEXT: Symbol: __imp___wcstok +; CHECK-NEXT: Symbol: __wcstok + +; CHECK: Type: code +; CHECK-NEXT: Name type: export as +; CHECK-NEXT: Export name: wcstok_s +; CHECK-NEXT: Symbol: __imp__wcstok +; CHECK-NEXT: Symbol: _wcstok diff --git a/llvm/test/tools/llvm-readobj/ELF/ARM/attribute-big-endian.test b/llvm/test/tools/llvm-readobj/ELF/ARM/attribute-big-endian.test index 3b94c7994cf3c0..44b5400f059c1f 100644 --- a/llvm/test/tools/llvm-readobj/ELF/ARM/attribute-big-endian.test +++ b/llvm/test/tools/llvm-readobj/ELF/ARM/attribute-big-endian.test @@ -1,11 +1,11 @@ # RUN: yaml2obj %s -o %t.o -# RUN: llvm-readelf -A %t.o 2>&1 | FileCheck %s +# RUN: llvm-readobj -A %t.o 2>&1 | FileCheck %s # CHECK: BuildAttributes { # CHECK-NEXT: FormatVersion: 0x41 # CHECK-NEXT: Section 1 { -# CHECK-NEXT: SectionLength: 22 -# CHECK-NEXT: Vendor: armabi +# CHECK-NEXT: SectionLength: 6 +# CHECK-NEXT: Vendor: aeabi # CHECK-NEXT: } # CHECK-NEXT: } @@ -18,6 +18,5 @@ FileHeader: Sections: - Name: .ARM.attributes Type: SHT_ARM_ATTRIBUTES - ContentArray: [ 0x41, 0x00, 0x00, 0x00, 0x16, 0x61, 0x72, 0x6D, 0x61, 0x62, - 0x69, 0x00, 0x01, 0x0b, 0x00, 0x00, 0x00, 0x04, 0x01, 0x06, 0x01, 0x08, - 0x01 ] + ContentArray: [ 0x41, 0x00, 0x00, 0x00, 0x06, 0x61, 0x65, 0x61, 0x62, 0x69, + 0x00 ] diff --git a/llvm/test/tools/llvm-split/AMDGPU/declarations.ll b/llvm/test/tools/llvm-split/AMDGPU/declarations.ll new file mode 100644 index 00000000000000..f579056e914aa4 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/declarations.ll @@ -0,0 +1,15 @@ +; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s + +; Check that all declarations are put into each partition. + +; CHECK0: declare void @A +; CHECK0: declare void @B + +; CHECK1: declare void @A +; CHECK1: declare void @B + +declare void @A() + +declare void @B() diff --git a/llvm/tools/llc/NewPMDriver.cpp b/llvm/tools/llc/NewPMDriver.cpp index fb1959c6457f4a..31c089e6344d0b 100644 --- a/llvm/tools/llc/NewPMDriver.cpp +++ b/llvm/tools/llc/NewPMDriver.cpp @@ -116,7 +116,6 @@ int llvm::compileModuleWithNewPM( PassInstrumentationCallbacks PIC; StandardInstrumentations SI(Context, Opt.DebugPM, !NoVerify); - SI.registerCallbacks(PIC); registerCodeGenCallback(PIC, LLVMTM); MachineFunctionAnalysisManager MFAM; @@ -131,6 +130,7 @@ int llvm::compileModuleWithNewPM( PB.registerLoopAnalyses(LAM); PB.registerMachineFunctionAnalyses(MFAM); PB.crossRegisterProxies(LAM, FAM, CGAM, MAM, &MFAM); + SI.registerCallbacks(PIC, &MAM); FAM.registerPass([&] { return TargetLibraryAnalysis(TLII); }); MAM.registerPass([&] { return MachineModuleAnalysis(MMI); }); diff --git a/llvm/tools/llvm-c-test/echo.cpp b/llvm/tools/llvm-c-test/echo.cpp index 6fa36421810f0f..c9bf03229683af 100644 --- a/llvm/tools/llvm-c-test/echo.cpp +++ b/llvm/tools/llvm-c-test/echo.cpp @@ -412,10 +412,9 @@ static LLVMValueRef clone_constant_impl(LLVMValueRef Cst, LLVMModuleRef M) { SmallVector Idx; for (int i = 1; i <= NumIdx; i++) Idx.push_back(clone_constant(LLVMGetOperand(Cst, i), M)); - if (LLVMIsInBounds(Cst)) - return LLVMConstInBoundsGEP2(ElemTy, Ptr, Idx.data(), NumIdx); - else - return LLVMConstGEP2(ElemTy, Ptr, Idx.data(), NumIdx); + + return LLVMConstGEPWithNoWrapFlags(ElemTy, Ptr, Idx.data(), NumIdx, + LLVMGEPGetNoWrapFlags(Cst)); } default: fprintf(stderr, "%d is not a supported opcode for constant expressions\n", @@ -767,11 +766,10 @@ struct FunCloner { int NumIdx = LLVMGetNumIndices(Src); for (int i = 1; i <= NumIdx; i++) Idx.push_back(CloneValue(LLVMGetOperand(Src, i))); - if (LLVMIsInBounds(Src)) - Dst = LLVMBuildInBoundsGEP2(Builder, ElemTy, Ptr, Idx.data(), NumIdx, - Name); - else - Dst = LLVMBuildGEP2(Builder, ElemTy, Ptr, Idx.data(), NumIdx, Name); + + Dst = LLVMBuildGEPWithNoWrapFlags(Builder, ElemTy, Ptr, Idx.data(), + NumIdx, Name, + LLVMGEPGetNoWrapFlags(Src)); break; } case LLVMAtomicRMW: { diff --git a/llvm/tools/lto/lto.cpp b/llvm/tools/lto/lto.cpp index ece6dd0f108301..d68cff839604f6 100644 --- a/llvm/tools/lto/lto.cpp +++ b/llvm/tools/lto/lto.cpp @@ -691,7 +691,7 @@ extern const char *lto_input_get_dependent_library(lto_input_t input, } extern const char *const *lto_runtime_lib_symbols_list(size_t *size) { - auto symbols = lto::LTO::getRuntimeLibcallSymbols(); + auto symbols = lto::LTO::getRuntimeLibcallSymbols(Triple()); *size = symbols.size(); return symbols.data(); } diff --git a/llvm/unittests/Analysis/VectorUtilsTest.cpp b/llvm/unittests/Analysis/VectorUtilsTest.cpp index 14958aa646a04d..48b4d37558af17 100644 --- a/llvm/unittests/Analysis/VectorUtilsTest.cpp +++ b/llvm/unittests/Analysis/VectorUtilsTest.cpp @@ -242,6 +242,30 @@ TEST_F(BasicTest, getShuffleDemandedElts) { EXPECT_EQ(RHS.getZExtValue(), 0x9U); } +TEST_F(BasicTest, getHorizontalDemandedEltsForFirstOperand) { + APInt LHS, RHS; + + getHorizDemandedEltsForFirstOperand(128, APInt(4, 0b0000), LHS, RHS); + EXPECT_EQ(LHS.getZExtValue(), 0b0000U); + EXPECT_EQ(RHS.getZExtValue(), 0b0000U); + + getHorizDemandedEltsForFirstOperand(128, APInt(4, 0b0001), LHS, RHS); + EXPECT_EQ(LHS.getZExtValue(), 0b0001U); + EXPECT_EQ(RHS.getZExtValue(), 0b0000U); + + getHorizDemandedEltsForFirstOperand(128, APInt(4, 0b1000), LHS, RHS); + EXPECT_EQ(LHS.getZExtValue(), 0b0000U); + EXPECT_EQ(RHS.getZExtValue(), 0b0100U); + + getHorizDemandedEltsForFirstOperand(128, APInt(4, 0b0110), LHS, RHS); + EXPECT_EQ(LHS.getZExtValue(), 0b0100U); + EXPECT_EQ(RHS.getZExtValue(), 0b0001U); + + getHorizDemandedEltsForFirstOperand(256, APInt(4, 0b0100), LHS, RHS); + EXPECT_EQ(LHS.getZExtValue(), 0b0100U); + EXPECT_EQ(RHS.getZExtValue(), 0b0000U); +} + TEST_F(BasicTest, getSplatIndex) { EXPECT_EQ(getSplatIndex({0,0,0}), 0); EXPECT_EQ(getSplatIndex({1,0,0}), -1); // no splat diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp index 46c385a0bc050e..a3d5e5f94b6109 100644 --- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp +++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp @@ -119,6 +119,41 @@ TEST_F(SelectionDAGPatternMatchTest, matchValueType) { EXPECT_FALSE(sd_match(Op2, m_ScalableVectorVT())); } +TEST_F(SelectionDAGPatternMatchTest, matchTernaryOp) { + SDLoc DL; + auto Int32VT = EVT::getIntegerVT(Context, 32); + + SDValue Op0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int32VT); + SDValue Op1 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, Int32VT); + + SDValue ICMP_UGT = DAG->getSetCC(DL, MVT::i1, Op0, Op1, ISD::SETUGT); + SDValue ICMP_EQ01 = DAG->getSetCC(DL, MVT::i1, Op0, Op1, ISD::SETEQ); + SDValue ICMP_EQ10 = DAG->getSetCC(DL, MVT::i1, Op1, Op0, ISD::SETEQ); + + using namespace SDPatternMatch; + ISD::CondCode CC; + EXPECT_TRUE(sd_match(ICMP_UGT, m_SetCC(m_Value(), m_Value(), + m_SpecificCondCode(ISD::SETUGT)))); + EXPECT_TRUE( + sd_match(ICMP_UGT, m_SetCC(m_Value(), m_Value(), m_CondCode(CC)))); + EXPECT_TRUE(CC == ISD::SETUGT); + EXPECT_FALSE(sd_match( + ICMP_UGT, m_SetCC(m_Value(), m_Value(), m_SpecificCondCode(ISD::SETLE)))); + + EXPECT_TRUE(sd_match(ICMP_EQ01, m_SetCC(m_Specific(Op0), m_Specific(Op1), + m_SpecificCondCode(ISD::SETEQ)))); + EXPECT_TRUE(sd_match(ICMP_EQ10, m_SetCC(m_Specific(Op1), m_Specific(Op0), + m_SpecificCondCode(ISD::SETEQ)))); + EXPECT_FALSE(sd_match(ICMP_EQ01, m_SetCC(m_Specific(Op1), m_Specific(Op0), + m_SpecificCondCode(ISD::SETEQ)))); + EXPECT_FALSE(sd_match(ICMP_EQ10, m_SetCC(m_Specific(Op0), m_Specific(Op1), + m_SpecificCondCode(ISD::SETEQ)))); + EXPECT_TRUE(sd_match(ICMP_EQ01, m_c_SetCC(m_Specific(Op1), m_Specific(Op0), + m_SpecificCondCode(ISD::SETEQ)))); + EXPECT_TRUE(sd_match(ICMP_EQ10, m_c_SetCC(m_Specific(Op0), m_Specific(Op1), + m_SpecificCondCode(ISD::SETEQ)))); +} + TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) { SDLoc DL; auto Int32VT = EVT::getIntegerVT(Context, 32); diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp index 5cb0310c0ad097..373a58d259af5e 100644 --- a/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp +++ b/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp @@ -384,15 +384,18 @@ void TestAllForms() { //---------------------------------------------------------------------- // Test reference forms //---------------------------------------------------------------------- - EXPECT_EQ(RefAddr, toReference(DieDG.find(Attr_DW_FORM_ref_addr), 0)); - EXPECT_EQ(Data1, toReference(DieDG.find(Attr_DW_FORM_ref1), 0)); - EXPECT_EQ(Data2, toReference(DieDG.find(Attr_DW_FORM_ref2), 0)); - EXPECT_EQ(Data4, toReference(DieDG.find(Attr_DW_FORM_ref4), 0)); - EXPECT_EQ(Data8, toReference(DieDG.find(Attr_DW_FORM_ref8), 0)); + EXPECT_EQ(RefAddr, + toDebugInfoReference(DieDG.find(Attr_DW_FORM_ref_addr), 0)); + EXPECT_EQ(Data1, toRelativeReference(DieDG.find(Attr_DW_FORM_ref1), 0)); + EXPECT_EQ(Data2, toRelativeReference(DieDG.find(Attr_DW_FORM_ref2), 0)); + EXPECT_EQ(Data4, toRelativeReference(DieDG.find(Attr_DW_FORM_ref4), 0)); + EXPECT_EQ(Data8, toRelativeReference(DieDG.find(Attr_DW_FORM_ref8), 0)); if (Version >= 4) { - EXPECT_EQ(Data8_2, toReference(DieDG.find(Attr_DW_FORM_ref_sig8), 0)); + EXPECT_EQ(Data8_2, + toSignatureReference(DieDG.find(Attr_DW_FORM_ref_sig8), 0)); } - EXPECT_EQ(UData[0], toReference(DieDG.find(Attr_DW_FORM_ref_udata), 0)); + EXPECT_EQ(UData[0], + toRelativeReference(DieDG.find(Attr_DW_FORM_ref_udata), 0)); //---------------------------------------------------------------------- // Test flag forms @@ -420,7 +423,7 @@ void TestAllForms() { // Test DWARF32/DWARF64 forms //---------------------------------------------------------------------- EXPECT_EQ(Dwarf32Values[0], - toReference(DieDG.find(Attr_DW_FORM_GNU_ref_alt), 0)); + toSupplementaryReference(DieDG.find(Attr_DW_FORM_GNU_ref_alt), 0)); if (Version >= 4) { EXPECT_EQ(Dwarf32Values[1], toSectionOffset(DieDG.find(Attr_DW_FORM_sec_offset), 0)); @@ -761,14 +764,14 @@ template void TestReferences() { EXPECT_TRUE(CU1Ref1DieDG.isValid()); EXPECT_EQ(CU1Ref1DieDG.getTag(), DW_TAG_variable); EXPECT_EQ(CU1TypeDieDG.getOffset(), - toReference(CU1Ref1DieDG.find(DW_AT_type), -1ULL)); + toRelativeReference(CU1Ref1DieDG.find(DW_AT_type), -1ULL)); // Verify the sibling is our Ref2 DIE and that its DW_AT_type points to our // base type DIE in CU1. auto CU1Ref2DieDG = CU1Ref1DieDG.getSibling(); EXPECT_TRUE(CU1Ref2DieDG.isValid()); EXPECT_EQ(CU1Ref2DieDG.getTag(), DW_TAG_variable); EXPECT_EQ(CU1TypeDieDG.getOffset(), - toReference(CU1Ref2DieDG.find(DW_AT_type), -1ULL)); + toRelativeReference(CU1Ref2DieDG.find(DW_AT_type), -1ULL)); // Verify the sibling is our Ref4 DIE and that its DW_AT_type points to our // base type DIE in CU1. @@ -776,7 +779,7 @@ template void TestReferences() { EXPECT_TRUE(CU1Ref4DieDG.isValid()); EXPECT_EQ(CU1Ref4DieDG.getTag(), DW_TAG_variable); EXPECT_EQ(CU1TypeDieDG.getOffset(), - toReference(CU1Ref4DieDG.find(DW_AT_type), -1ULL)); + toRelativeReference(CU1Ref4DieDG.find(DW_AT_type), -1ULL)); // Verify the sibling is our Ref8 DIE and that its DW_AT_type points to our // base type DIE in CU1. @@ -784,7 +787,7 @@ template void TestReferences() { EXPECT_TRUE(CU1Ref8DieDG.isValid()); EXPECT_EQ(CU1Ref8DieDG.getTag(), DW_TAG_variable); EXPECT_EQ(CU1TypeDieDG.getOffset(), - toReference(CU1Ref8DieDG.find(DW_AT_type), -1ULL)); + toRelativeReference(CU1Ref8DieDG.find(DW_AT_type), -1ULL)); // Verify the sibling is our RefAddr DIE and that its DW_AT_type points to our // base type DIE in CU1. @@ -792,7 +795,7 @@ template void TestReferences() { EXPECT_TRUE(CU1RefAddrDieDG.isValid()); EXPECT_EQ(CU1RefAddrDieDG.getTag(), DW_TAG_variable); EXPECT_EQ(CU1TypeDieDG.getOffset(), - toReference(CU1RefAddrDieDG.find(DW_AT_type), -1ULL)); + toDebugInfoReference(CU1RefAddrDieDG.find(DW_AT_type), -1ULL)); // Verify the sibling of the Ref4 DIE is our RefAddr DIE and that its // DW_AT_type points to our base type DIE. @@ -800,38 +803,38 @@ template void TestReferences() { EXPECT_TRUE(CU1ToCU2RefAddrDieDG.isValid()); EXPECT_EQ(CU1ToCU2RefAddrDieDG.getTag(), DW_TAG_variable); EXPECT_EQ(CU2TypeDieDG.getOffset(), - toReference(CU1ToCU2RefAddrDieDG.find(DW_AT_type), -1ULL)); + toDebugInfoReference(CU1ToCU2RefAddrDieDG.find(DW_AT_type), -1ULL)); // Verify the sibling of the base type DIE is our Ref1 DIE and that its // DW_AT_type points to our base type DIE. auto CU2Ref1DieDG = CU2TypeDieDG.getSibling(); EXPECT_TRUE(CU2Ref1DieDG.isValid()); EXPECT_EQ(CU2Ref1DieDG.getTag(), DW_TAG_variable); - EXPECT_EQ(CU2TypeDieDG.getOffset(), - toReference(CU2Ref1DieDG.find(DW_AT_type), -1ULL)); + EXPECT_EQ(CU2TypeDieDG.getOffset() - CU2TypeDieDG.getDwarfUnit()->getOffset(), + toRelativeReference(CU2Ref1DieDG.find(DW_AT_type), -1ULL)); // Verify the sibling is our Ref2 DIE and that its DW_AT_type points to our // base type DIE in CU2. auto CU2Ref2DieDG = CU2Ref1DieDG.getSibling(); EXPECT_TRUE(CU2Ref2DieDG.isValid()); EXPECT_EQ(CU2Ref2DieDG.getTag(), DW_TAG_variable); - EXPECT_EQ(CU2TypeDieDG.getOffset(), - toReference(CU2Ref2DieDG.find(DW_AT_type), -1ULL)); + EXPECT_EQ(CU2TypeDieDG.getOffset() - CU2TypeDieDG.getDwarfUnit()->getOffset(), + toRelativeReference(CU2Ref2DieDG.find(DW_AT_type), -1ULL)); // Verify the sibling is our Ref4 DIE and that its DW_AT_type points to our // base type DIE in CU2. auto CU2Ref4DieDG = CU2Ref2DieDG.getSibling(); EXPECT_TRUE(CU2Ref4DieDG.isValid()); EXPECT_EQ(CU2Ref4DieDG.getTag(), DW_TAG_variable); - EXPECT_EQ(CU2TypeDieDG.getOffset(), - toReference(CU2Ref4DieDG.find(DW_AT_type), -1ULL)); + EXPECT_EQ(CU2TypeDieDG.getOffset() - CU2TypeDieDG.getDwarfUnit()->getOffset(), + toRelativeReference(CU2Ref4DieDG.find(DW_AT_type), -1ULL)); // Verify the sibling is our Ref8 DIE and that its DW_AT_type points to our // base type DIE in CU2. auto CU2Ref8DieDG = CU2Ref4DieDG.getSibling(); EXPECT_TRUE(CU2Ref8DieDG.isValid()); EXPECT_EQ(CU2Ref8DieDG.getTag(), DW_TAG_variable); - EXPECT_EQ(CU2TypeDieDG.getOffset(), - toReference(CU2Ref8DieDG.find(DW_AT_type), -1ULL)); + EXPECT_EQ(CU2TypeDieDG.getOffset() - CU2TypeDieDG.getDwarfUnit()->getOffset(), + toRelativeReference(CU2Ref8DieDG.find(DW_AT_type), -1ULL)); // Verify the sibling is our RefAddr DIE and that its DW_AT_type points to our // base type DIE in CU2. @@ -839,7 +842,7 @@ template void TestReferences() { EXPECT_TRUE(CU2RefAddrDieDG.isValid()); EXPECT_EQ(CU2RefAddrDieDG.getTag(), DW_TAG_variable); EXPECT_EQ(CU2TypeDieDG.getOffset(), - toReference(CU2RefAddrDieDG.find(DW_AT_type), -1ULL)); + toDebugInfoReference(CU2RefAddrDieDG.find(DW_AT_type), -1ULL)); // Verify the sibling of the Ref4 DIE is our RefAddr DIE and that its // DW_AT_type points to our base type DIE. @@ -847,7 +850,7 @@ template void TestReferences() { EXPECT_TRUE(CU2ToCU1RefAddrDieDG.isValid()); EXPECT_EQ(CU2ToCU1RefAddrDieDG.getTag(), DW_TAG_variable); EXPECT_EQ(CU1TypeDieDG.getOffset(), - toReference(CU2ToCU1RefAddrDieDG.find(DW_AT_type), -1ULL)); + toDebugInfoReference(CU2ToCU1RefAddrDieDG.find(DW_AT_type), -1ULL)); } TEST(DWARFDebugInfo, TestDWARF32Version2Addr4References) { @@ -1662,14 +1665,20 @@ TEST(DWARFDebugInfo, TestDwarfToFunctions) { std::optional FormValOpt1 = DWARFFormValue(); EXPECT_FALSE(toString(FormValOpt1).has_value()); EXPECT_FALSE(toUnsigned(FormValOpt1).has_value()); - EXPECT_FALSE(toReference(FormValOpt1).has_value()); + EXPECT_FALSE(toRelativeReference(FormValOpt1).has_value()); + EXPECT_FALSE(toDebugInfoReference(FormValOpt1).has_value()); + EXPECT_FALSE(toSignatureReference(FormValOpt1).has_value()); + EXPECT_FALSE(toSupplementaryReference(FormValOpt1).has_value()); EXPECT_FALSE(toSigned(FormValOpt1).has_value()); EXPECT_FALSE(toAddress(FormValOpt1).has_value()); EXPECT_FALSE(toSectionOffset(FormValOpt1).has_value()); EXPECT_FALSE(toBlock(FormValOpt1).has_value()); EXPECT_EQ(nullptr, toString(FormValOpt1, nullptr)); EXPECT_EQ(InvalidU64, toUnsigned(FormValOpt1, InvalidU64)); - EXPECT_EQ(InvalidU64, toReference(FormValOpt1, InvalidU64)); + EXPECT_EQ(InvalidU64, toRelativeReference(FormValOpt1, InvalidU64)); + EXPECT_EQ(InvalidU64, toDebugInfoReference(FormValOpt1, InvalidU64)); + EXPECT_EQ(InvalidU64, toSignatureReference(FormValOpt1, InvalidU64)); + EXPECT_EQ(InvalidU64, toSupplementaryReference(FormValOpt1, InvalidU64)); EXPECT_EQ(InvalidU64, toAddress(FormValOpt1, InvalidU64)); EXPECT_EQ(InvalidU64, toSectionOffset(FormValOpt1, InvalidU64)); EXPECT_EQ(InvalidS64, toSigned(FormValOpt1, InvalidS64)); @@ -1681,14 +1690,20 @@ TEST(DWARFDebugInfo, TestDwarfToFunctions) { EXPECT_FALSE(toString(FormValOpt2).has_value()); EXPECT_FALSE(toUnsigned(FormValOpt2).has_value()); - EXPECT_FALSE(toReference(FormValOpt2).has_value()); + EXPECT_FALSE(toRelativeReference(FormValOpt2).has_value()); + EXPECT_FALSE(toDebugInfoReference(FormValOpt2).has_value()); + EXPECT_FALSE(toSignatureReference(FormValOpt2).has_value()); + EXPECT_FALSE(toSupplementaryReference(FormValOpt2).has_value()); EXPECT_FALSE(toSigned(FormValOpt2).has_value()); EXPECT_TRUE(toAddress(FormValOpt2).has_value()); EXPECT_FALSE(toSectionOffset(FormValOpt2).has_value()); EXPECT_FALSE(toBlock(FormValOpt2).has_value()); EXPECT_EQ(nullptr, toString(FormValOpt2, nullptr)); EXPECT_EQ(InvalidU64, toUnsigned(FormValOpt2, InvalidU64)); - EXPECT_EQ(InvalidU64, toReference(FormValOpt2, InvalidU64)); + EXPECT_EQ(InvalidU64, toRelativeReference(FormValOpt2, InvalidU64)); + EXPECT_EQ(InvalidU64, toDebugInfoReference(FormValOpt2, InvalidU64)); + EXPECT_EQ(InvalidU64, toSignatureReference(FormValOpt2, InvalidU64)); + EXPECT_EQ(InvalidU64, toSupplementaryReference(FormValOpt2, InvalidU64)); EXPECT_EQ(Address, toAddress(FormValOpt2, InvalidU64)); EXPECT_EQ(InvalidU64, toSectionOffset(FormValOpt2, InvalidU64)); EXPECT_EQ(InvalidS64, toSigned(FormValOpt2, InvalidU64)); @@ -1700,36 +1715,98 @@ TEST(DWARFDebugInfo, TestDwarfToFunctions) { EXPECT_FALSE(toString(FormValOpt3).has_value()); EXPECT_TRUE(toUnsigned(FormValOpt3).has_value()); - EXPECT_FALSE(toReference(FormValOpt3).has_value()); + EXPECT_FALSE(toRelativeReference(FormValOpt3).has_value()); + EXPECT_FALSE(toDebugInfoReference(FormValOpt3).has_value()); + EXPECT_FALSE(toSignatureReference(FormValOpt3).has_value()); + EXPECT_FALSE(toSupplementaryReference(FormValOpt3).has_value()); EXPECT_TRUE(toSigned(FormValOpt3).has_value()); EXPECT_FALSE(toAddress(FormValOpt3).has_value()); EXPECT_FALSE(toSectionOffset(FormValOpt3).has_value()); EXPECT_FALSE(toBlock(FormValOpt3).has_value()); EXPECT_EQ(nullptr, toString(FormValOpt3, nullptr)); EXPECT_EQ(UData8, toUnsigned(FormValOpt3, InvalidU64)); - EXPECT_EQ(InvalidU64, toReference(FormValOpt3, InvalidU64)); + EXPECT_EQ(InvalidU64, toRelativeReference(FormValOpt3, InvalidU64)); + EXPECT_EQ(InvalidU64, toDebugInfoReference(FormValOpt3, InvalidU64)); + EXPECT_EQ(InvalidU64, toSignatureReference(FormValOpt3, InvalidU64)); + EXPECT_EQ(InvalidU64, toSupplementaryReference(FormValOpt3, InvalidU64)); EXPECT_EQ(InvalidU64, toAddress(FormValOpt3, InvalidU64)); EXPECT_EQ(InvalidU64, toSectionOffset(FormValOpt3, InvalidU64)); EXPECT_EQ((int64_t)UData8, toSigned(FormValOpt3, InvalidU64)); - // Test successful and unsuccessful reference decoding. + // Test successful and unsuccessful ref_addr decoding. uint32_t RefData = 0x11223344U; - std::optional FormValOpt4 = + std::optional FormValOpt4Addr = DWARFFormValue::createFromUValue(DW_FORM_ref_addr, RefData); - EXPECT_FALSE(toString(FormValOpt4).has_value()); - EXPECT_FALSE(toUnsigned(FormValOpt4).has_value()); - EXPECT_TRUE(toReference(FormValOpt4).has_value()); - EXPECT_FALSE(toSigned(FormValOpt4).has_value()); - EXPECT_FALSE(toAddress(FormValOpt4).has_value()); - EXPECT_FALSE(toSectionOffset(FormValOpt4).has_value()); - EXPECT_FALSE(toBlock(FormValOpt4).has_value()); - EXPECT_EQ(nullptr, toString(FormValOpt4, nullptr)); - EXPECT_EQ(InvalidU64, toUnsigned(FormValOpt4, InvalidU64)); - EXPECT_EQ(RefData, toReference(FormValOpt4, InvalidU64)); - EXPECT_EQ(InvalidU64, toAddress(FormValOpt4, InvalidU64)); - EXPECT_EQ(InvalidU64, toSectionOffset(FormValOpt4, InvalidU64)); - EXPECT_EQ(InvalidS64, toSigned(FormValOpt4, InvalidU64)); + EXPECT_FALSE(toString(FormValOpt4Addr).has_value()); + EXPECT_FALSE(toUnsigned(FormValOpt4Addr).has_value()); + EXPECT_FALSE(toRelativeReference(FormValOpt4Addr).has_value()); + EXPECT_TRUE(toDebugInfoReference(FormValOpt4Addr).has_value()); + EXPECT_FALSE(toSignatureReference(FormValOpt4Addr).has_value()); + EXPECT_FALSE(toSupplementaryReference(FormValOpt4Addr).has_value()); + EXPECT_FALSE(toSigned(FormValOpt4Addr).has_value()); + EXPECT_FALSE(toAddress(FormValOpt4Addr).has_value()); + EXPECT_FALSE(toSectionOffset(FormValOpt4Addr).has_value()); + EXPECT_FALSE(toBlock(FormValOpt4Addr).has_value()); + EXPECT_EQ(nullptr, toString(FormValOpt4Addr, nullptr)); + EXPECT_EQ(InvalidU64, toUnsigned(FormValOpt4Addr, InvalidU64)); + EXPECT_EQ(InvalidU64, toRelativeReference(FormValOpt4Addr, InvalidU64)); + EXPECT_EQ(RefData, toDebugInfoReference(FormValOpt4Addr, InvalidU64)); + EXPECT_EQ(InvalidU64, toSignatureReference(FormValOpt4Addr, InvalidU64)); + EXPECT_EQ(InvalidU64, toSupplementaryReference(FormValOpt4Addr, InvalidU64)); + EXPECT_EQ(InvalidU64, toAddress(FormValOpt4Addr, InvalidU64)); + EXPECT_EQ(InvalidU64, toSectionOffset(FormValOpt4Addr, InvalidU64)); + EXPECT_EQ(InvalidS64, toSigned(FormValOpt4Addr, InvalidU64)); + + // Test successful and unsuccessful ref_sig8 decoding. + std::optional FormValOpt4Sig = + DWARFFormValue::createFromUValue(DW_FORM_ref_sig8, RefData); + + EXPECT_FALSE(toString(FormValOpt4Sig).has_value()); + EXPECT_FALSE(toUnsigned(FormValOpt4Sig).has_value()); + EXPECT_FALSE(toRelativeReference(FormValOpt4Sig).has_value()); + EXPECT_FALSE(toDebugInfoReference(FormValOpt4Sig).has_value()); + EXPECT_TRUE(toSignatureReference(FormValOpt4Sig).has_value()); + EXPECT_FALSE(toSupplementaryReference(FormValOpt4Sig).has_value()); + EXPECT_FALSE(toSigned(FormValOpt4Sig).has_value()); + EXPECT_FALSE(toAddress(FormValOpt4Sig).has_value()); + EXPECT_FALSE(toSectionOffset(FormValOpt4Sig).has_value()); + EXPECT_FALSE(toBlock(FormValOpt4Sig).has_value()); + EXPECT_EQ(nullptr, toString(FormValOpt4Sig, nullptr)); + EXPECT_EQ(InvalidU64, toUnsigned(FormValOpt4Sig, InvalidU64)); + EXPECT_EQ(InvalidU64, toRelativeReference(FormValOpt4Sig, InvalidU64)); + EXPECT_EQ(InvalidU64, toDebugInfoReference(FormValOpt4Sig, InvalidU64)); + EXPECT_EQ(RefData, toSignatureReference(FormValOpt4Sig, InvalidU64)); + EXPECT_EQ(InvalidU64, toSupplementaryReference(FormValOpt4Sig, InvalidU64)); + EXPECT_EQ(InvalidU64, toAddress(FormValOpt4Sig, InvalidU64)); + EXPECT_EQ(InvalidU64, toSectionOffset(FormValOpt4Sig, InvalidU64)); + EXPECT_EQ(InvalidS64, toSigned(FormValOpt4Sig, InvalidU64)); + + // Test successful and unsuccessful ref_alt decoding. + // Not testing relative reference forms here, as they require a valid + // DWARFUnit object. + std::optional FormValOpt4Alt = + DWARFFormValue::createFromUValue(DW_FORM_GNU_ref_alt, RefData); + + EXPECT_FALSE(toString(FormValOpt4Alt).has_value()); + EXPECT_FALSE(toUnsigned(FormValOpt4Alt).has_value()); + EXPECT_FALSE(toRelativeReference(FormValOpt4Alt).has_value()); + EXPECT_FALSE(toDebugInfoReference(FormValOpt4Alt).has_value()); + EXPECT_FALSE(toSignatureReference(FormValOpt4Alt).has_value()); + EXPECT_TRUE(toSupplementaryReference(FormValOpt4Alt).has_value()); + EXPECT_FALSE(toSigned(FormValOpt4Alt).has_value()); + EXPECT_FALSE(toAddress(FormValOpt4Alt).has_value()); + EXPECT_FALSE(toSectionOffset(FormValOpt4Alt).has_value()); + EXPECT_FALSE(toBlock(FormValOpt4Alt).has_value()); + EXPECT_EQ(nullptr, toString(FormValOpt4Alt, nullptr)); + EXPECT_EQ(InvalidU64, toUnsigned(FormValOpt4Alt, InvalidU64)); + EXPECT_EQ(InvalidU64, toRelativeReference(FormValOpt4Alt, InvalidU64)); + EXPECT_EQ(InvalidU64, toDebugInfoReference(FormValOpt4Alt, InvalidU64)); + EXPECT_EQ(InvalidU64, toSignatureReference(FormValOpt4Alt, InvalidU64)); + EXPECT_EQ(RefData, toSupplementaryReference(FormValOpt4Alt, InvalidU64)); + EXPECT_EQ(InvalidU64, toAddress(FormValOpt4Alt, InvalidU64)); + EXPECT_EQ(InvalidU64, toSectionOffset(FormValOpt4Alt, InvalidU64)); + EXPECT_EQ(InvalidS64, toSigned(FormValOpt4Alt, InvalidU64)); // Test successful and unsuccessful signed constant decoding. int64_t SData8 = 0x1020304050607080ULL; @@ -1738,14 +1815,20 @@ TEST(DWARFDebugInfo, TestDwarfToFunctions) { EXPECT_FALSE(toString(FormValOpt5).has_value()); EXPECT_TRUE(toUnsigned(FormValOpt5).has_value()); - EXPECT_FALSE(toReference(FormValOpt5).has_value()); + EXPECT_FALSE(toRelativeReference(FormValOpt5).has_value()); + EXPECT_FALSE(toDebugInfoReference(FormValOpt5).has_value()); + EXPECT_FALSE(toSignatureReference(FormValOpt5).has_value()); + EXPECT_FALSE(toSupplementaryReference(FormValOpt5).has_value()); EXPECT_TRUE(toSigned(FormValOpt5).has_value()); EXPECT_FALSE(toAddress(FormValOpt5).has_value()); EXPECT_FALSE(toSectionOffset(FormValOpt5).has_value()); EXPECT_FALSE(toBlock(FormValOpt5).has_value()); EXPECT_EQ(nullptr, toString(FormValOpt5, nullptr)); EXPECT_EQ((uint64_t)SData8, toUnsigned(FormValOpt5, InvalidU64)); - EXPECT_EQ(InvalidU64, toReference(FormValOpt5, InvalidU64)); + EXPECT_EQ(InvalidU64, toRelativeReference(FormValOpt5, InvalidU64)); + EXPECT_EQ(InvalidU64, toDebugInfoReference(FormValOpt5, InvalidU64)); + EXPECT_EQ(InvalidU64, toSignatureReference(FormValOpt5, InvalidU64)); + EXPECT_EQ(InvalidU64, toSupplementaryReference(FormValOpt5, InvalidU64)); EXPECT_EQ(InvalidU64, toAddress(FormValOpt5, InvalidU64)); EXPECT_EQ(InvalidU64, toSectionOffset(FormValOpt5, InvalidU64)); EXPECT_EQ(SData8, toSigned(FormValOpt5, InvalidU64)); @@ -1758,7 +1841,10 @@ TEST(DWARFDebugInfo, TestDwarfToFunctions) { EXPECT_FALSE(toString(FormValOpt6).has_value()); EXPECT_FALSE(toUnsigned(FormValOpt6).has_value()); - EXPECT_FALSE(toReference(FormValOpt6).has_value()); + EXPECT_FALSE(toRelativeReference(FormValOpt6).has_value()); + EXPECT_FALSE(toDebugInfoReference(FormValOpt6).has_value()); + EXPECT_FALSE(toSignatureReference(FormValOpt6).has_value()); + EXPECT_FALSE(toSupplementaryReference(FormValOpt6).has_value()); EXPECT_FALSE(toSigned(FormValOpt6).has_value()); EXPECT_FALSE(toAddress(FormValOpt6).has_value()); EXPECT_FALSE(toSectionOffset(FormValOpt6).has_value()); @@ -1767,7 +1853,10 @@ TEST(DWARFDebugInfo, TestDwarfToFunctions) { EXPECT_EQ(*BlockOpt, Array); EXPECT_EQ(nullptr, toString(FormValOpt6, nullptr)); EXPECT_EQ(InvalidU64, toUnsigned(FormValOpt6, InvalidU64)); - EXPECT_EQ(InvalidU64, toReference(FormValOpt6, InvalidU64)); + EXPECT_EQ(InvalidU64, toRelativeReference(FormValOpt6, InvalidU64)); + EXPECT_EQ(InvalidU64, toDebugInfoReference(FormValOpt6, InvalidU64)); + EXPECT_EQ(InvalidU64, toSignatureReference(FormValOpt6, InvalidU64)); + EXPECT_EQ(InvalidU64, toSupplementaryReference(FormValOpt6, InvalidU64)); EXPECT_EQ(InvalidU64, toAddress(FormValOpt6, InvalidU64)); EXPECT_EQ(InvalidU64, toSectionOffset(FormValOpt6, InvalidU64)); EXPECT_EQ(InvalidS64, toSigned(FormValOpt6, InvalidU64)); diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 98c0052d878d85..ec68ed0afeb2f7 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -471,3 +471,92 @@ define void @foo(i32 %v1) { } #endif // NDEBUG } + +TEST_F(SandboxIRTest, Instruction) { + parseIR(C, R"IR( +define void @foo(i8 %v1) { + %add0 = add i8 %v1, %v1 + %sub1 = sub i8 %add0, %v1 + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + sandboxir::Function *F = Ctx.createFunction(LLVMF); + auto *Arg = F->getArg(0); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *I0 = &*It++; + auto *I1 = &*It++; + auto *Ret = &*It++; + + // Check getPrevNode(). + EXPECT_EQ(Ret->getPrevNode(), I1); + EXPECT_EQ(I1->getPrevNode(), I0); + EXPECT_EQ(I0->getPrevNode(), nullptr); + + // Check getNextNode(). + EXPECT_EQ(I0->getNextNode(), I1); + EXPECT_EQ(I1->getNextNode(), Ret); + EXPECT_EQ(Ret->getNextNode(), nullptr); + + // Check getIterator(). + EXPECT_EQ(I0->getIterator(), std::next(BB->begin(), 0)); + EXPECT_EQ(I1->getIterator(), std::next(BB->begin(), 1)); + EXPECT_EQ(Ret->getIterator(), std::next(BB->begin(), 2)); + + // Check getOpcode(). + EXPECT_EQ(I0->getOpcode(), sandboxir::Instruction::Opcode::Opaque); + EXPECT_EQ(I1->getOpcode(), sandboxir::Instruction::Opcode::Opaque); + EXPECT_EQ(Ret->getOpcode(), sandboxir::Instruction::Opcode::Opaque); + + // Check moveBefore(I). + I1->moveBefore(I0); + EXPECT_EQ(I0->getPrevNode(), I1); + EXPECT_EQ(I1->getNextNode(), I0); + + // Check moveAfter(I). + I1->moveAfter(I0); + EXPECT_EQ(I0->getNextNode(), I1); + EXPECT_EQ(I1->getPrevNode(), I0); + + // Check moveBefore(BB, It). + I1->moveBefore(*BB, BB->begin()); + EXPECT_EQ(I1->getPrevNode(), nullptr); + EXPECT_EQ(I1->getNextNode(), I0); + I1->moveBefore(*BB, BB->end()); + EXPECT_EQ(I1->getNextNode(), nullptr); + EXPECT_EQ(Ret->getNextNode(), I1); + I1->moveBefore(*BB, std::next(BB->begin())); + EXPECT_EQ(I0->getNextNode(), I1); + EXPECT_EQ(I1->getNextNode(), Ret); + + // Check removeFromParent(). + I0->removeFromParent(); +#ifndef NDEBUG + EXPECT_DEATH(I0->getPrevNode(), ".*Detached.*"); + EXPECT_DEATH(I0->getNextNode(), ".*Detached.*"); +#endif // NDEBUG + EXPECT_EQ(I0->getParent(), nullptr); + EXPECT_EQ(I1->getPrevNode(), nullptr); + EXPECT_EQ(I0->getOperand(0), Arg); + + // Check insertBefore(). + I0->insertBefore(I1); + EXPECT_EQ(I1->getPrevNode(), I0); + + // Check insertInto(). + I0->removeFromParent(); + I0->insertInto(BB, BB->end()); + EXPECT_EQ(Ret->getNextNode(), I0); + I0->moveBefore(I1); + EXPECT_EQ(I0->getNextNode(), I1); + + // Check eraseFromParent(). +#ifndef NDEBUG + EXPECT_DEATH(I0->eraseFromParent(), "Still connected to users.*"); +#endif + I1->eraseFromParent(); + EXPECT_EQ(I0->getNumUses(), 0u); + EXPECT_EQ(I0->getNextNode(), Ret); +} diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index d50079870f7ac0..e0a848351d06fb 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -1026,8 +1026,8 @@ R"(All available -march extensions for RISC-V xwchc 2.2 Experimental extensions - zicfilp 0.4 This is a long dummy description - zicfiss 0.4 + zicfilp 1.0 This is a long dummy description + zicfiss 1.0 zalasr 0.1 smmpm 1.0 smnpm 1.0 @@ -1079,9 +1079,9 @@ R"(Extensions enabled for the given RISC-V target i 2.1 'I' (Base Integer Instruction Set) Experimental extensions - zicfilp 0.4 'Zicfilp' (Landing pad) + zicfilp 1.0 'Zicfilp' (Landing pad) -ISA String: rv64i2p1_zicfilp0p4_zicsr2p0 +ISA String: rv64i2p1_zicfilp1p0_zicsr2p0 )"; // clang-format on diff --git a/llvm/unittests/Transforms/Utils/CMakeLists.txt b/llvm/unittests/Transforms/Utils/CMakeLists.txt index 35055baa05ee99..8a14a5b8e249ef 100644 --- a/llvm/unittests/Transforms/Utils/CMakeLists.txt +++ b/llvm/unittests/Transforms/Utils/CMakeLists.txt @@ -18,6 +18,7 @@ add_llvm_unittest(UtilsTests CodeLayoutTest.cpp CodeMoverUtilsTest.cpp DebugifyTest.cpp + DXILResourceTest.cpp FunctionComparatorTest.cpp IntegerDivisionTest.cpp LocalTest.cpp diff --git a/llvm/unittests/Transforms/Utils/DXILResourceTest.cpp b/llvm/unittests/Transforms/Utils/DXILResourceTest.cpp new file mode 100644 index 00000000000000..1e494cbb7da92e --- /dev/null +++ b/llvm/unittests/Transforms/Utils/DXILResourceTest.cpp @@ -0,0 +1,301 @@ +//===- DXILResourceTest.cpp - Unit tests for DXILResource -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/DXILResource.h" +#include "llvm/IR/Constants.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace dxil; + +namespace { +// Helper to succinctly build resource shaped metadata for tests. +struct MDBuilder { + LLVMContext &Context; + Type *Int32Ty; + Type *Int1Ty; + + MDBuilder(LLVMContext &Context, Type *Int32Ty, Type *Int1Ty) + : Context(Context), Int32Ty(Int32Ty), Int1Ty(Int1Ty) {} + + template + void appendMDs(SmallVectorImpl &MDs, int V, Ts... More) { + MDs.push_back(ConstantAsMetadata::get( + Constant::getIntegerValue(Int32Ty, APInt(32, V)))); + appendMDs(MDs, More...); + } + template + void appendMDs(SmallVectorImpl &MDs, unsigned int V, Ts... More) { + MDs.push_back(ConstantAsMetadata::get( + Constant::getIntegerValue(Int32Ty, APInt(32, V)))); + appendMDs(MDs, More...); + } + template + void appendMDs(SmallVectorImpl &MDs, bool V, Ts... More) { + MDs.push_back(ConstantAsMetadata::get( + Constant::getIntegerValue(Int1Ty, APInt(1, V)))); + appendMDs(MDs, More...); + } + template + void appendMDs(SmallVectorImpl &MDs, Value *V, Ts... More) { + MDs.push_back(ValueAsMetadata::get(V)); + appendMDs(MDs, More...); + } + template + void appendMDs(SmallVectorImpl &MDs, const char *V, Ts... More) { + MDs.push_back(MDString::get(Context, V)); + appendMDs(MDs, More...); + } + template + void appendMDs(SmallVectorImpl &MDs, StringRef V, Ts... More) { + MDs.push_back(MDString::get(Context, V)); + appendMDs(MDs, More...); + } + template + void appendMDs(SmallVectorImpl &MDs, std::nullptr_t V, + Ts... More) { + MDs.push_back(nullptr); + appendMDs(MDs, More...); + } + template + void appendMDs(SmallVectorImpl &MDs, MDTuple *V, Ts... More) { + MDs.push_back(V); + appendMDs(MDs, More...); + } + void appendMDs(SmallVectorImpl &MDs) { + // Base case, nothing to do. + } + + template MDTuple *get(Ts... Data) { + SmallVector MDs; + appendMDs(MDs, Data...); + return MDNode::get(Context, MDs); + } +}; + +testing::AssertionResult MDTupleEq(const char *LHSExpr, const char *RHSExpr, + MDTuple *LHS, MDTuple *RHS) { + if (LHS == RHS) + return testing::AssertionSuccess(); + std::string LHSRepr, RHSRepr; + raw_string_ostream LHSS(LHSRepr), RHSS(RHSRepr); + LHS->printTree(LHSS); + RHS->printTree(RHSS); + + return testing::AssertionFailure() << "Expected equality:\n" + << " " << LHSExpr << "\n" + << "Which is:\n" + << " " << LHSS.str() << "\n\n" + << " " << RHSExpr << "\n" + << "Which is:\n" + << " " << RHSS.str(); +} +#define EXPECT_MDEQ(X, Y) EXPECT_PRED_FORMAT2(MDTupleEq, X, Y) +} // namespace + +TEST(DXILResource, AnnotationsAndMetadata) { + LLVMContext Context; + Type *Int1Ty = Type::getInt1Ty(Context); + Type *Int32Ty = Type::getInt32Ty(Context); + Type *FloatTy = Type::getFloatTy(Context); + Type *DoubleTy = Type::getDoubleTy(Context); + Type *Floatx4Ty = FixedVectorType::get(FloatTy, 4); + Type *Floatx3Ty = FixedVectorType::get(FloatTy, 3); + Type *Int32x2Ty = FixedVectorType::get(Int32Ty, 2); + + MDBuilder TestMD(Context, Int32Ty, Int1Ty); + + // ByteAddressBuffer Buffer0; + Value *Symbol = UndefValue::get( + StructType::create(Context, {Int32Ty}, "struct.ByteAddressBuffer")); + ResourceInfo Resource = + ResourceInfo::RawBuffer(Symbol, "Buffer0", ResourceBinding{0, 0, 1}, + /*UniqueID=*/0); + std::pair Props = Resource.getAnnotateProps(); + EXPECT_EQ(Props.first, 0x0000000bU); + EXPECT_EQ(Props.second, 0U); + MDTuple *MD = Resource.getAsMetadata(Context); + EXPECT_MDEQ(MD, TestMD.get(0, Symbol, "Buffer0", 0, 0, 1, 11, 0, nullptr)); + + // RWByteAddressBuffer BufferOut : register(u3, space2); + Symbol = UndefValue::get( + StructType::create(Context, {Int32Ty}, "struct.RWByteAddressBuffer")); + Resource = ResourceInfo::RWRawBuffer( + Symbol, "BufferOut", ResourceBinding{2, 3, 1}, /*UniqueID=*/1, + /*GloballyCoherent=*/false, /*IsROV=*/false); + Props = Resource.getAnnotateProps(); + EXPECT_EQ(Props.first, 0x0000100bU); + EXPECT_EQ(Props.second, 0U); + MD = Resource.getAsMetadata(Context); + EXPECT_MDEQ(MD, TestMD.get(1, Symbol, "BufferOut", 2, 3, 1, 11, false, false, + false, nullptr)); + + // struct BufType0 { int i; float f; double d; }; + // StructuredBuffer Buffer0 : register(t0); + StructType *BufType0 = + StructType::create(Context, {Int32Ty, FloatTy, DoubleTy}, "BufType0"); + Symbol = UndefValue::get(StructType::create( + Context, {BufType0}, "class.StructuredBuffer")); + Resource = ResourceInfo::StructuredBuffer( + Symbol, "Buffer0", ResourceBinding{0, 0, 1}, /*UniqueID=*/0, + /*Stride=*/16, Align(8)); + Props = Resource.getAnnotateProps(); + EXPECT_EQ(Props.first, 0x0000030cU); + EXPECT_EQ(Props.second, 0x00000010U); + MD = Resource.getAsMetadata(Context); + EXPECT_MDEQ( + MD, TestMD.get(0, Symbol, "Buffer0", 0, 0, 1, 12, 0, TestMD.get(1, 16))); + + // Texture2D ColorMapTexture : register(t2); + Symbol = UndefValue::get(StructType::create( + Context, {Floatx4Ty}, "class.Texture2D >")); + Resource = + ResourceInfo::SRV(Symbol, "ColorMapTexture", ResourceBinding{0, 2, 1}, + /*UniqueID=*/2, dxil::ElementType::F32, + /*ElementCount=*/4, dxil::ResourceKind::Texture2D); + Props = Resource.getAnnotateProps(); + EXPECT_EQ(Props.first, 0x00000002U); + EXPECT_EQ(Props.second, 0x00000409U); + MD = Resource.getAsMetadata(Context); + EXPECT_MDEQ(MD, TestMD.get(2, Symbol, "ColorMapTexture", 0, 2, 1, 2, 0, + TestMD.get(0, 9))); + + // Texture2DMS DepthBuffer : register(t0); + Symbol = UndefValue::get( + StructType::create(Context, {FloatTy}, "class.Texture2DMS")); + Resource = + ResourceInfo::Texture2DMS(Symbol, "DepthBuffer", ResourceBinding{0, 0, 1}, + /*UniqueID=*/0, dxil::ElementType::F32, + /*ElementCount=*/1, /*SampleCount=*/8); + Props = Resource.getAnnotateProps(); + EXPECT_EQ(Props.first, 0x00000003U); + EXPECT_EQ(Props.second, 0x00080109U); + MD = Resource.getAsMetadata(Context); + EXPECT_MDEQ(MD, TestMD.get(0, Symbol, "DepthBuffer", 0, 0, 1, 3, 8, + TestMD.get(0, 9))); + + // FeedbackTexture2D feedbackMinMip; + Symbol = UndefValue::get( + StructType::create(Context, {Int32Ty}, "class.FeedbackTexture2D<0>")); + Resource = ResourceInfo::FeedbackTexture2D( + Symbol, "feedbackMinMip", ResourceBinding{0, 0, 1}, + /*UniqueID=*/0, SamplerFeedbackType::MinMip); + Props = Resource.getAnnotateProps(); + EXPECT_EQ(Props.first, 0x00001011U); + EXPECT_EQ(Props.second, 0U); + MD = Resource.getAsMetadata(Context); + EXPECT_MDEQ(MD, TestMD.get(0, Symbol, "feedbackMinMip", 0, 0, 1, 17, false, + false, false, TestMD.get(2, 0))); + + // FeedbackTexture2DArray feedbackMipRegion; + Symbol = UndefValue::get(StructType::create( + Context, {Int32Ty}, "class.FeedbackTexture2DArray<1>")); + Resource = ResourceInfo::FeedbackTexture2DArray( + Symbol, "feedbackMipRegion", ResourceBinding{0, 0, 1}, + /*UniqueID=*/0, SamplerFeedbackType::MipRegionUsed); + Props = Resource.getAnnotateProps(); + EXPECT_EQ(Props.first, 0x00001012U); + EXPECT_EQ(Props.second, 0x00000001U); + MD = Resource.getAsMetadata(Context); + EXPECT_MDEQ(MD, TestMD.get(0, Symbol, "feedbackMipRegion", 0, 0, 1, 18, false, + false, false, TestMD.get(2, 1))); + + // globallycoherent RWTexture2D OutputTexture : register(u0, space2); + Symbol = UndefValue::get(StructType::create( + Context, {Int32x2Ty}, "class.RWTexture2D >")); + Resource = + ResourceInfo::UAV(Symbol, "OutputTexture", ResourceBinding{2, 0, 1}, + /*UniqueID=*/0, dxil::ElementType::I32, + /*ElementCount=*/2, /*GloballyCoherent=*/1, /*IsROV=*/0, + dxil::ResourceKind::Texture2D); + Props = Resource.getAnnotateProps(); + EXPECT_EQ(Props.first, 0x00005002U); + EXPECT_EQ(Props.second, 0x00000204U); + MD = Resource.getAsMetadata(Context); + EXPECT_MDEQ(MD, TestMD.get(0, Symbol, "OutputTexture", 2, 0, 1, 2, true, + false, false, TestMD.get(0, 4))); + + // RasterizerOrderedBuffer ROB; + Symbol = UndefValue::get( + StructType::create(Context, {Floatx4Ty}, + "class.RasterizerOrderedBuffer >")); + Resource = ResourceInfo::UAV(Symbol, "ROB", ResourceBinding{0, 0, 1}, + /*UniqueID=*/0, dxil::ElementType::F32, + /*ElementCount=*/4, /*GloballyCoherent=*/0, + /*IsROV=*/1, dxil::ResourceKind::TypedBuffer); + Props = Resource.getAnnotateProps(); + EXPECT_EQ(Props.first, 0x0000300aU); + EXPECT_EQ(Props.second, 0x00000409U); + MD = Resource.getAsMetadata(Context); + EXPECT_MDEQ(MD, TestMD.get(0, Symbol, "ROB", 0, 0, 1, 10, false, false, true, + TestMD.get(0, 9))); + + // RWStructuredBuffer g_OutputBuffer : register(u2); + StructType *BufType1 = StructType::create( + Context, {Floatx3Ty, FloatTy, Int32Ty}, "ParticleMotion"); + Symbol = UndefValue::get(StructType::create( + Context, {BufType1}, "class.StructuredBuffer")); + Resource = ResourceInfo::RWStructuredBuffer( + Symbol, "g_OutputBuffer", ResourceBinding{0, 2, 1}, + /*UniqueID=*/0, /*Stride=*/20, Align(4), /*GloballyCoherent=*/false, + /*IsROV=*/false, /*HasCounter=*/true); + Props = Resource.getAnnotateProps(); + EXPECT_EQ(Props.first, 0x0000920cU); + EXPECT_EQ(Props.second, 0x00000014U); + MD = Resource.getAsMetadata(Context); + EXPECT_MDEQ(MD, TestMD.get(0, Symbol, "g_OutputBuffer", 0, 2, 1, 12, false, + true, false, TestMD.get(1, 20))); + + // RWTexture2DMSArray g_rw_t2dmsa; + Symbol = UndefValue::get(StructType::create( + Context, {Int32Ty}, "class.RWTexture2DMSArray")); + Resource = ResourceInfo::RWTexture2DMSArray( + Symbol, "g_rw_t2dmsa", ResourceBinding{0, 0, 1}, + /*UniqueID=*/0, dxil::ElementType::U32, /*ElementCount=*/1, + /*SampleCount=*/8, /*GloballyCoherent=*/false); + Props = Resource.getAnnotateProps(); + EXPECT_EQ(Props.first, 0x00001008U); + EXPECT_EQ(Props.second, 0x00080105U); + MD = Resource.getAsMetadata(Context); + EXPECT_MDEQ(MD, TestMD.get(0, Symbol, "g_rw_t2dmsa", 0, 0, 1, 8, false, false, + false, TestMD.get(0, 5))); + + // cbuffer cb0 { float4 g_X; float4 g_Y; } + Symbol = UndefValue::get( + StructType::create(Context, {Floatx4Ty, Floatx4Ty}, "cb0")); + Resource = ResourceInfo::CBuffer(Symbol, "cb0", ResourceBinding{0, 0, 1}, + /*UniqueID=*/0, /*Size=*/32); + Props = Resource.getAnnotateProps(); + EXPECT_EQ(Props.first, 0x0000000dU); + EXPECT_EQ(Props.second, 0x00000020U); + MD = Resource.getAsMetadata(Context); + EXPECT_MDEQ(MD, TestMD.get(0, Symbol, "cb0", 0, 0, 1, 32, nullptr)); + + // SamplerState ColorMapSampler : register(s0); + Symbol = UndefValue::get( + StructType::create(Context, {Int32Ty}, "struct.SamplerState")); + Resource = + ResourceInfo::Sampler(Symbol, "ColorMapSampler", ResourceBinding{0, 0, 1}, + /*UniqueID=*/0, dxil::SamplerType::Default); + Props = Resource.getAnnotateProps(); + EXPECT_EQ(Props.first, 0x0000000eU); + EXPECT_EQ(Props.second, 0U); + MD = Resource.getAsMetadata(Context); + EXPECT_MDEQ(MD, + TestMD.get(0, Symbol, "ColorMapSampler", 0, 0, 1, 0, nullptr)); + + // SamplerComparisonState ShadowSampler {...}; + Resource = + ResourceInfo::Sampler(Symbol, "CmpSampler", ResourceBinding{0, 0, 1}, + /*UniqueID=*/0, dxil::SamplerType::Comparison); + Props = Resource.getAnnotateProps(); + EXPECT_EQ(Props.first, 0x0000800eU); + EXPECT_EQ(Props.second, 0U); + MD = Resource.getAsMetadata(Context); + EXPECT_MDEQ(MD, TestMD.get(0, Symbol, "CmpSampler", 0, 0, 1, 1, nullptr)); +} diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 4022c343d63ef8..d9d6789134d88d 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1133,6 +1133,20 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { EXPECT_FALSE(Recipe.mayReadOrWriteMemory()); } + { + VPValue ChainOp; + VPValue VecOp; + VPValue CondOp; + VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp, + &VecOp, false); + VPValue EVL; + VPReductionEVLRecipe EVLRecipe(&Recipe, &EVL, &CondOp); + EXPECT_FALSE(EVLRecipe.mayHaveSideEffects()); + EXPECT_FALSE(EVLRecipe.mayReadFromMemory()); + EXPECT_FALSE(EVLRecipe.mayWriteToMemory()); + EXPECT_FALSE(EVLRecipe.mayReadOrWriteMemory()); + } + { auto *Load = new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1)); @@ -1472,6 +1486,21 @@ TEST(VPRecipeTest, CastVPReductionRecipeToVPUser) { EXPECT_TRUE(isa(BaseR)); } +TEST(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) { + LLVMContext C; + + VPValue ChainOp; + VPValue VecOp; + VPValue CondOp; + VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp, + &VecOp, false); + VPValue EVL; + VPReductionEVLRecipe EVLRecipe(&Recipe, &EVL, &CondOp); + EXPECT_TRUE(isa(&EVLRecipe)); + VPRecipeBase *BaseR = &EVLRecipe; + EXPECT_TRUE(isa(BaseR)); +} + struct VPDoubleValueDef : public VPRecipeBase { VPDoubleValueDef(ArrayRef Operands) : VPRecipeBase(99, Operands) { new VPValue(nullptr, this); diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h index edddc051c162c4..9e345dceddf528 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h +++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h @@ -1175,7 +1175,7 @@ class CmpPredicateOperandMatcher : public OperandPredicateMatcher { public: CmpPredicateOperandMatcher(unsigned InsnVarID, unsigned OpIdx, std::string P) : OperandPredicateMatcher(OPM_CmpPredicate, InsnVarID, OpIdx), - PredName(P) {} + PredName(std::move(P)) {} bool isIdentical(const PredicateMatcher &B) const override { return OperandPredicateMatcher::isIdentical(B) && diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index c303322e63b449..f46a8d1e9f081d 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -1877,7 +1877,8 @@ static std::string findOperandDecoderMethod(Record *Record) { } if (Record->isSubClassOf("RegisterOperand")) - Record = Record->getValueAsDef("RegClass"); + // Allows use of a DecoderMethod in referenced RegisterClass if set. + return findOperandDecoderMethod(Record->getValueAsDef("RegClass")); if (Record->isSubClassOf("RegisterClass")) { Decoder = "Decode" + Record->getName().str() + "RegisterClass"; @@ -2268,9 +2269,7 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, return MCDisassembler::Fail; case MCD::OPC_ExtractField: { // Decode the start value. - unsigned DecodedLen; - unsigned Start = decodeULEB128(++Ptr, &DecodedLen); - Ptr += DecodedLen; + unsigned Start = decodeULEB128AndIncUnsafe(++Ptr); unsigned Len = *Ptr++;)"; if (IsVarLenInst) OS << "\n makeUp(insn, Start + Len);"; @@ -2282,9 +2281,7 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, } case MCD::OPC_FilterValue: { // Decode the field value. - unsigned Len; - uint64_t Val = decodeULEB128(++Ptr, &Len); - Ptr += Len; + uint64_t Val = decodeULEB128AndIncUnsafe(++Ptr); // NumToSkip is a plain 24-bit integer. unsigned NumToSkip = *Ptr++; NumToSkip |= (*Ptr++) << 8; diff --git a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp index b76ba05954aa51..04e9e0fa48db0a 100644 --- a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp +++ b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp @@ -164,7 +164,8 @@ static void emitRISCVProfiles(RecordKeeper &Records, raw_ostream &OS) { static void emitRISCVProcs(RecordKeeper &RK, raw_ostream &OS) { OS << "#ifndef PROC\n" - << "#define PROC(ENUM, NAME, DEFAULT_MARCH, FAST_UNALIGNED_ACCESS)\n" + << "#define PROC(ENUM, NAME, DEFAULT_MARCH, FAST_SCALAR_UNALIGN" + << ", FAST_VECTOR_UNALIGN)\n" << "#endif\n\n"; // Iterate on all definition records. @@ -180,9 +181,6 @@ static void emitRISCVProcs(RecordKeeper &RK, raw_ostream &OS) { return Feature->getValueAsString("Name") == "unaligned-vector-mem"; }); - bool FastUnalignedAccess = - FastScalarUnalignedAccess && FastVectorUnalignedAccess; - OS << "PROC(" << Rec->getName() << ", {\"" << Rec->getValueAsString("Name") << "\"}, {\""; @@ -193,7 +191,8 @@ static void emitRISCVProcs(RecordKeeper &RK, raw_ostream &OS) { printMArch(OS, Features); else OS << MArch; - OS << "\"}, " << FastUnalignedAccess << ")\n"; + OS << "\"}, " << FastScalarUnalignedAccess << ", " + << FastVectorUnalignedAccess << ")\n"; } OS << "\n#undef PROC\n"; OS << "\n"; diff --git a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp index 950ff1394b9fd8..f967344135553b 100644 --- a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp +++ b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp @@ -56,6 +56,8 @@ class X86InstrMappingEmitter { raw_ostream &OS); void emitND2NonNDTable(ArrayRef Insts, raw_ostream &OS); + void emitSSE2AVXTable(ArrayRef Insts, + raw_ostream &OS); // Prints the definition of class X86TableEntry. void printClassDef(raw_ostream &OS); @@ -335,6 +337,38 @@ void X86InstrMappingEmitter::emitND2NonNDTable( printTable(Table, "X86ND2NonNDTable", "GET_X86_ND2NONND_TABLE", OS); } +void X86InstrMappingEmitter::emitSSE2AVXTable( + ArrayRef Insts, raw_ostream &OS) { + + const std::map ManualMap = { +#define ENTRY_SSE2AVX(OLD, NEW) {#OLD, #NEW}, +#include "X86ManualInstrMapping.def" + }; + + std::vector Table; + for (const CodeGenInstruction *Inst : Insts) { + const Record *Rec = Inst->TheDef; + StringRef Name = Rec->getName(); + if (!isInteresting(Rec)) + continue; + if (ManualMap.find(Name) != ManualMap.end()) { + auto *NewRec = Records.getDef(ManualMap.at(Rec->getName())); + assert(NewRec && "Instruction not found!"); + auto &NewInst = Target.getInstruction(NewRec); + Table.push_back(std::pair(Inst, &NewInst)); + continue; + } + + std::string NewName = ("V" + Name).str(); + auto *AVXRec = Records.getDef(NewName); + if (!AVXRec) + continue; + auto &AVXInst = Target.getInstruction(AVXRec); + Table.push_back(std::pair(Inst, &AVXInst)); + } + printTable(Table, "X86SSE2AVXTable", "GET_X86_SSE2AVX_TABLE", OS); +} + void X86InstrMappingEmitter::run(raw_ostream &OS) { emitSourceFileHeader("X86 instruction mapping", OS); @@ -344,6 +378,7 @@ void X86InstrMappingEmitter::run(raw_ostream &OS) { emitCompressEVEXTable(Insts, OS); emitNFTransformTable(Insts, OS); emitND2NonNDTable(Insts, OS); + emitSSE2AVXTable(Insts, OS); } } // namespace diff --git a/llvm/utils/TableGen/X86ManualInstrMapping.def b/llvm/utils/TableGen/X86ManualInstrMapping.def index 364f15607f73dd..58f5449f3b27b7 100644 --- a/llvm/utils/TableGen/X86ManualInstrMapping.def +++ b/llvm/utils/TableGen/X86ManualInstrMapping.def @@ -349,3 +349,14 @@ NOCOMP_ND(CFCMOV64rr_ND) ENTRY_ND(MOVBE32rr, BSWAP32r) ENTRY_ND(MOVBE64rr, BSWAP64r) #undef ENTRY_ND + +#ifndef ENTRY_SSE2AVX +#define ENTRY_SSE2AVX(OLD, NEW) +#endif +ENTRY_SSE2AVX(BLENDVPDrm0, VBLENDVPDrmr) +ENTRY_SSE2AVX(BLENDVPDrr0, VBLENDVPDrrr) +ENTRY_SSE2AVX(BLENDVPSrm0, VBLENDVPSrmr) +ENTRY_SSE2AVX(BLENDVPSrr0, VBLENDVPSrrr) +ENTRY_SSE2AVX(PBLENDVBrm0, VPBLENDVBrmr) +ENTRY_SSE2AVX(PBLENDVBrr0, VPBLENDVBrrr) +#undef ENTRY_SSE2AVX diff --git a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn index d2cf5243627a04..576ab1db54988d 100644 --- a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn @@ -108,6 +108,7 @@ static_library("Basic") { "Targets/DirectX.cpp", "Targets/Hexagon.cpp", "Targets/Lanai.cpp", + "Targets/Le64.cpp", "Targets/LoongArch.cpp", "Targets/M68k.cpp", "Targets/MSP430.cpp", diff --git a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn index 8f0e27dc2fd91d..ed903559cac845 100644 --- a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn @@ -106,6 +106,7 @@ static_library("Sema") { "SemaSystemZ.cpp", "SemaTemplate.cpp", "SemaTemplateDeduction.cpp", + "SemaTemplateDeductionGuide.cpp", "SemaTemplateInstantiate.cpp", "SemaTemplateInstantiateDecl.cpp", "SemaTemplateVariadic.cpp", diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index b0d0a4b601b42b..b976e9745fbef2 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -847,8 +847,6 @@ if (current_toolchain == default_toolchain) { "__type_traits/is_implicitly_default_constructible.h", "__type_traits/is_integral.h", "__type_traits/is_literal_type.h", - "__type_traits/is_member_function_pointer.h", - "__type_traits/is_member_object_pointer.h", "__type_traits/is_member_pointer.h", "__type_traits/is_nothrow_assignable.h", "__type_traits/is_nothrow_constructible.h", diff --git a/llvm/utils/gn/secondary/lldb/include/lldb/Host/BUILD.gn b/llvm/utils/gn/secondary/lldb/include/lldb/Host/BUILD.gn index 8498f9b838e53e..c46a916373ed01 100644 --- a/llvm/utils/gn/secondary/lldb/include/lldb/Host/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/include/lldb/Host/BUILD.gn @@ -1,4 +1,3 @@ -import("//llvm/utils/gn/build/libs/curl/enable.gni") import("//llvm/utils/gn/build/libs/xml/enable.gni") import("//llvm/utils/gn/build/write_cmake_config.gni") import("libedit.gni") @@ -37,12 +36,6 @@ write_cmake_config("Config") { values += [ "LLDB_ENABLE_LIBEDIT=" ] } - if (llvm_enable_libcurl) { - values += [ "LLVM_ENABLE_CURL=1" ] - } else { - values += [ "LLVM_ENABLE_CURL=" ] - } - if (llvm_enable_libxml2) { values += [ "LLDB_ENABLE_LIBXML2=1" ] } else { diff --git a/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn index 27d56f91164f7f..ef578ddcd09cf3 100644 --- a/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn @@ -11,7 +11,10 @@ lldb_tablegen("TargetPropertiesEnum") { static_library("Target") { output_name = "lldbTarget" - configs += [ "//llvm/utils/gn/build:lldb_code" ] + configs += [ + "//llvm/utils/gn/build:clang_code", + "//llvm/utils/gn/build:lldb_code", + ] deps = [ ":TargetProperties", ":TargetPropertiesEnum", @@ -100,5 +103,6 @@ static_library("Target") { "UnixSignals.cpp", "UnwindAssembly.cpp", "UnwindLLDB.cpp", + "VerboseTrapFrameRecognizer.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn index c9bc184926e175..0f6e345b9d1754 100644 --- a/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn @@ -73,6 +73,7 @@ static_library("IR") { "ProfileSummary.cpp", "PseudoProbe.cpp", "ReplaceConstant.cpp", + "RuntimeLibcalls.cpp", "SSAContext.cpp", "SafepointIRVerifier.cpp", "Statepoint.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/X86/AsmParser/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/X86/AsmParser/BUILD.gn index 89312a42d8816c..12fa720a2efdc6 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/X86/AsmParser/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/X86/AsmParser/BUILD.gn @@ -10,6 +10,7 @@ static_library("AsmParser") { output_name = "LLVMX86AsmParser" deps = [ ":X86GenAsmMatcher", + "..:X86GenInstrMapping", "//llvm/lib/MC", "//llvm/lib/MC/MCParser", "//llvm/lib/Support", diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn index aae0a2aa00d8d0..a9d5328c5d1b37 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn @@ -13,7 +13,10 @@ tablegen("X86GenDAGISel") { } tablegen("X86GenInstrMapping") { - visibility = [ ":LLVMX86CodeGen" ] + visibility = [ + ":LLVMX86CodeGen", + "AsmParser", + ] args = [ "-gen-x86-instr-mapping" ] td_file = "X86.td" } diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn index 58acabf85d296b..6ea9c8e4f47488 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn @@ -26,6 +26,7 @@ static_library("Utils") { "CodeMoverUtils.cpp", "CountVisits.cpp", "CtorUtils.cpp", + "DXILResource.cpp", "DXILUpgrade.cpp", "Debugify.cpp", "DemoteRegToStack.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/Transforms/Utils/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Transforms/Utils/BUILD.gn index 380ed71a2bc010..4a924dab029083 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Transforms/Utils/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Transforms/Utils/BUILD.gn @@ -19,6 +19,7 @@ unittest("UtilsTests") { "CodeExtractorTest.cpp", "CodeLayoutTest.cpp", "CodeMoverUtilsTest.cpp", + "DXILResourceTest.cpp", "DebugifyTest.cpp", "FunctionComparatorTest.cpp", "IntegerDivisionTest.cpp", diff --git a/mlir/docs/DialectConversion.md b/mlir/docs/DialectConversion.md index db26e6477d5fc7..23e74470a835f7 100644 --- a/mlir/docs/DialectConversion.md +++ b/mlir/docs/DialectConversion.md @@ -352,7 +352,8 @@ class TypeConverter { /// This method registers a materialization that will be called when /// converting (potentially multiple) block arguments that were the result of - /// a signature conversion of a single block argument, to a single SSA value. + /// a signature conversion of a single block argument, to a single SSA value + /// with the old argument type. template ::template arg_t<1>> void addArgumentMaterialization(FnT &&callback) { diff --git a/mlir/include/mlir-c/Rewrite.h b/mlir/include/mlir-c/Rewrite.h index bed93045f4b50d..d8f2275b615325 100644 --- a/mlir/include/mlir-c/Rewrite.h +++ b/mlir/include/mlir-c/Rewrite.h @@ -33,10 +33,266 @@ extern "C" { }; \ typedef struct name name +DEFINE_C_API_STRUCT(MlirRewriterBase, void); DEFINE_C_API_STRUCT(MlirFrozenRewritePatternSet, void); DEFINE_C_API_STRUCT(MlirGreedyRewriteDriverConfig, void); DEFINE_C_API_STRUCT(MlirRewritePatternSet, void); +//===----------------------------------------------------------------------===// +/// RewriterBase API inherited from OpBuilder +//===----------------------------------------------------------------------===// + +/// Get the MLIR context referenced by the rewriter. +MLIR_CAPI_EXPORTED MlirContext +mlirRewriterBaseGetContext(MlirRewriterBase rewriter); + +//===----------------------------------------------------------------------===// +/// Insertion points methods + +// These do not include functions using Block::iterator or Region::iterator, as +// they are not exposed by the C API yet. Similarly for methods using +// `InsertPoint` directly. + +/// Reset the insertion point to no location. Creating an operation without a +/// set insertion point is an error, but this can still be useful when the +/// current insertion point a builder refers to is being removed. +MLIR_CAPI_EXPORTED void +mlirRewriterBaseClearInsertionPoint(MlirRewriterBase rewriter); + +/// Sets the insertion point to the specified operation, which will cause +/// subsequent insertions to go right before it. +MLIR_CAPI_EXPORTED void +mlirRewriterBaseSetInsertionPointBefore(MlirRewriterBase rewriter, + MlirOperation op); + +/// Sets the insertion point to the node after the specified operation, which +/// will cause subsequent insertions to go right after it. +MLIR_CAPI_EXPORTED void +mlirRewriterBaseSetInsertionPointAfter(MlirRewriterBase rewriter, + MlirOperation op); + +/// Sets the insertion point to the node after the specified value. If value +/// has a defining operation, sets the insertion point to the node after such +/// defining operation. This will cause subsequent insertions to go right +/// after it. Otherwise, value is a BlockArgument. Sets the insertion point to +/// the start of its block. +MLIR_CAPI_EXPORTED void +mlirRewriterBaseSetInsertionPointAfterValue(MlirRewriterBase rewriter, + MlirValue value); + +/// Sets the insertion point to the start of the specified block. +MLIR_CAPI_EXPORTED void +mlirRewriterBaseSetInsertionPointToStart(MlirRewriterBase rewriter, + MlirBlock block); + +/// Sets the insertion point to the end of the specified block. +MLIR_CAPI_EXPORTED void +mlirRewriterBaseSetInsertionPointToEnd(MlirRewriterBase rewriter, + MlirBlock block); + +/// Return the block the current insertion point belongs to. Note that the +/// insertion point is not necessarily the end of the block. +MLIR_CAPI_EXPORTED MlirBlock +mlirRewriterBaseGetInsertionBlock(MlirRewriterBase rewriter); + +/// Returns the current block of the rewriter. +MLIR_CAPI_EXPORTED MlirBlock +mlirRewriterBaseGetBlock(MlirRewriterBase rewriter); + +//===----------------------------------------------------------------------===// +/// Block and operation creation/insertion/cloning + +// These functions do not include the IRMapper, as it is not yet exposed by the +// C API. + +/// Add new block with 'argTypes' arguments and set the insertion point to the +/// end of it. The block is placed before 'insertBefore'. `locs` contains the +/// locations of the inserted arguments, and should match the size of +/// `argTypes`. +MLIR_CAPI_EXPORTED MlirBlock mlirRewriterBaseCreateBlockBefore( + MlirRewriterBase rewriter, MlirBlock insertBefore, intptr_t nArgTypes, + MlirType const *argTypes, MlirLocation const *locations); + +/// Insert the given operation at the current insertion point and return it. +MLIR_CAPI_EXPORTED MlirOperation +mlirRewriterBaseInsert(MlirRewriterBase rewriter, MlirOperation op); + +/// Creates a deep copy of the specified operation. +MLIR_CAPI_EXPORTED MlirOperation +mlirRewriterBaseClone(MlirRewriterBase rewriter, MlirOperation op); + +/// Creates a deep copy of this operation but keep the operation regions +/// empty. +MLIR_CAPI_EXPORTED MlirOperation mlirRewriterBaseCloneWithoutRegions( + MlirRewriterBase rewriter, MlirOperation op); + +/// Clone the blocks that belong to "region" before the given position in +/// another region "parent". +MLIR_CAPI_EXPORTED void +mlirRewriterBaseCloneRegionBefore(MlirRewriterBase rewriter, MlirRegion region, + MlirBlock before); + +//===----------------------------------------------------------------------===// +/// RewriterBase API +//===----------------------------------------------------------------------===// + +/// Move the blocks that belong to "region" before the given position in +/// another region "parent". The two regions must be different. The caller +/// is responsible for creating or updating the operation transferring flow +/// of control to the region and passing it the correct block arguments. +MLIR_CAPI_EXPORTED void +mlirRewriterBaseInlineRegionBefore(MlirRewriterBase rewriter, MlirRegion region, + MlirBlock before); + +/// Replace the results of the given (original) operation with the specified +/// list of values (replacements). The result types of the given op and the +/// replacements must match. The original op is erased. +MLIR_CAPI_EXPORTED void +mlirRewriterBaseReplaceOpWithValues(MlirRewriterBase rewriter, MlirOperation op, + intptr_t nValues, MlirValue const *values); + +/// Replace the results of the given (original) operation with the specified +/// new op (replacement). The result types of the two ops must match. The +/// original op is erased. +MLIR_CAPI_EXPORTED void +mlirRewriterBaseReplaceOpWithOperation(MlirRewriterBase rewriter, + MlirOperation op, MlirOperation newOp); + +/// Erases an operation that is known to have no uses. +MLIR_CAPI_EXPORTED void mlirRewriterBaseEraseOp(MlirRewriterBase rewriter, + MlirOperation op); + +/// Erases a block along with all operations inside it. +MLIR_CAPI_EXPORTED void mlirRewriterBaseEraseBlock(MlirRewriterBase rewriter, + MlirBlock block); + +/// Inline the operations of block 'source' before the operation 'op'. The +/// source block will be deleted and must have no uses. 'argValues' is used to +/// replace the block arguments of 'source' +/// +/// The source block must have no successors. Otherwise, the resulting IR +/// would have unreachable operations. +MLIR_CAPI_EXPORTED void +mlirRewriterBaseInlineBlockBefore(MlirRewriterBase rewriter, MlirBlock source, + MlirOperation op, intptr_t nArgValues, + MlirValue const *argValues); + +/// Inline the operations of block 'source' into the end of block 'dest'. The +/// source block will be deleted and must have no uses. 'argValues' is used to +/// replace the block arguments of 'source' +/// +/// The dest block must have no successors. Otherwise, the resulting IR would +/// have unreachable operation. +MLIR_CAPI_EXPORTED void mlirRewriterBaseMergeBlocks(MlirRewriterBase rewriter, + MlirBlock source, + MlirBlock dest, + intptr_t nArgValues, + MlirValue const *argValues); + +/// Unlink this operation from its current block and insert it right before +/// `existingOp` which may be in the same or another block in the same +/// function. +MLIR_CAPI_EXPORTED void mlirRewriterBaseMoveOpBefore(MlirRewriterBase rewriter, + MlirOperation op, + MlirOperation existingOp); + +/// Unlink this operation from its current block and insert it right after +/// `existingOp` which may be in the same or another block in the same +/// function. +MLIR_CAPI_EXPORTED void mlirRewriterBaseMoveOpAfter(MlirRewriterBase rewriter, + MlirOperation op, + MlirOperation existingOp); + +/// Unlink this block and insert it right before `existingBlock`. +MLIR_CAPI_EXPORTED void +mlirRewriterBaseMoveBlockBefore(MlirRewriterBase rewriter, MlirBlock block, + MlirBlock existingBlock); + +/// This method is used to notify the rewriter that an in-place operation +/// modification is about to happen. A call to this function *must* be +/// followed by a call to either `finalizeOpModification` or +/// `cancelOpModification`. This is a minor efficiency win (it avoids creating +/// a new operation and removing the old one) but also often allows simpler +/// code in the client. +MLIR_CAPI_EXPORTED void +mlirRewriterBaseStartOpModification(MlirRewriterBase rewriter, + MlirOperation op); + +/// This method is used to signal the end of an in-place modification of the +/// given operation. This can only be called on operations that were provided +/// to a call to `startOpModification`. +MLIR_CAPI_EXPORTED void +mlirRewriterBaseFinalizeOpModification(MlirRewriterBase rewriter, + MlirOperation op); + +/// This method cancels a pending in-place modification. This can only be +/// called on operations that were provided to a call to +/// `startOpModification`. +MLIR_CAPI_EXPORTED void +mlirRewriterBaseCancelOpModification(MlirRewriterBase rewriter, + MlirOperation op); + +/// Find uses of `from` and replace them with `to`. Also notify the listener +/// about every in-place op modification (for every use that was replaced). +MLIR_CAPI_EXPORTED void +mlirRewriterBaseReplaceAllUsesWith(MlirRewriterBase rewriter, MlirValue from, + MlirValue to); + +/// Find uses of `from` and replace them with `to`. Also notify the listener +/// about every in-place op modification (for every use that was replaced). +MLIR_CAPI_EXPORTED void mlirRewriterBaseReplaceAllValueRangeUsesWith( + MlirRewriterBase rewriter, intptr_t nValues, MlirValue const *from, + MlirValue const *to); + +/// Find uses of `from` and replace them with `to`. Also notify the listener +/// about every in-place op modification (for every use that was replaced) +/// and that the `from` operation is about to be replaced. +MLIR_CAPI_EXPORTED void +mlirRewriterBaseReplaceAllOpUsesWithValueRange(MlirRewriterBase rewriter, + MlirOperation from, intptr_t nTo, + MlirValue const *to); + +/// Find uses of `from` and replace them with `to`. Also notify the listener +/// about every in-place op modification (for every use that was replaced) +/// and that the `from` operation is about to be replaced. +MLIR_CAPI_EXPORTED void mlirRewriterBaseReplaceAllOpUsesWithOperation( + MlirRewriterBase rewriter, MlirOperation from, MlirOperation to); + +/// Find uses of `from` within `block` and replace them with `to`. Also notify +/// the listener about every in-place op modification (for every use that was +/// replaced). The optional `allUsesReplaced` flag is set to "true" if all +/// uses were replaced. +MLIR_CAPI_EXPORTED void mlirRewriterBaseReplaceOpUsesWithinBlock( + MlirRewriterBase rewriter, MlirOperation op, intptr_t nNewValues, + MlirValue const *newValues, MlirBlock block); + +/// Find uses of `from` and replace them with `to` except if the user is +/// `exceptedUser`. Also notify the listener about every in-place op +/// modification (for every use that was replaced). +MLIR_CAPI_EXPORTED void +mlirRewriterBaseReplaceAllUsesExcept(MlirRewriterBase rewriter, MlirValue from, + MlirValue to, MlirOperation exceptedUser); + +//===----------------------------------------------------------------------===// +/// IRRewriter API +//===----------------------------------------------------------------------===// + +/// Create an IRRewriter and transfer ownership to the caller. +MLIR_CAPI_EXPORTED MlirRewriterBase mlirIRRewriterCreate(MlirContext context); + +/// Create an IRRewriter and transfer ownership to the caller. Additionally +/// set the insertion point before the operation. +MLIR_CAPI_EXPORTED MlirRewriterBase +mlirIRRewriterCreateFromOp(MlirOperation op); + +/// Takes an IRRewriter owned by the caller and destroys it. It is the +/// responsibility of the user to only pass an IRRewriter class. +MLIR_CAPI_EXPORTED void mlirIRRewriterDestroy(MlirRewriterBase rewriter); + +//===----------------------------------------------------------------------===// +/// FrozenRewritePatternSet API +//===----------------------------------------------------------------------===// + MLIR_CAPI_EXPORTED MlirFrozenRewritePatternSet mlirFreezeRewritePattern(MlirRewritePatternSet op); @@ -47,6 +303,10 @@ MLIR_CAPI_EXPORTED MlirLogicalResult mlirApplyPatternsAndFoldGreedily( MlirModule op, MlirFrozenRewritePatternSet patterns, MlirGreedyRewriteDriverConfig); +//===----------------------------------------------------------------------===// +/// PDLPatternModule API +//===----------------------------------------------------------------------===// + #if MLIR_ENABLE_PDL_IN_PATTERNMATCH DEFINE_C_API_STRUCT(MlirPDLPatternModule, void); diff --git a/mlir/include/mlir/Analysis/Presburger/QuasiPolynomial.h b/mlir/include/mlir/Analysis/Presburger/QuasiPolynomial.h index aeac19e827b44f..5a0962df89d37a 100644 --- a/mlir/include/mlir/Analysis/Presburger/QuasiPolynomial.h +++ b/mlir/include/mlir/Analysis/Presburger/QuasiPolynomial.h @@ -36,10 +36,10 @@ namespace presburger { // g_{ij} : Q^n -> Q are affine functionals. class QuasiPolynomial : public PresburgerSpace { public: - QuasiPolynomial(unsigned numVars, SmallVector coeffs = {}, - std::vector>> aff = {}); + QuasiPolynomial(unsigned numVars, ArrayRef coeffs = {}, + ArrayRef>> aff = {}); - QuasiPolynomial(unsigned numVars, Fraction constant); + QuasiPolynomial(unsigned numVars, const Fraction &constant); // Find the number of inputs (numDomain) to the polynomial. // numSymbols is set to zero. @@ -57,7 +57,7 @@ class QuasiPolynomial : public PresburgerSpace { QuasiPolynomial operator+(const QuasiPolynomial &x) const; QuasiPolynomial operator-(const QuasiPolynomial &x) const; QuasiPolynomial operator*(const QuasiPolynomial &x) const; - QuasiPolynomial operator/(const Fraction x) const; + QuasiPolynomial operator/(const Fraction &x) const; // Removes terms which evaluate to zero from the expression // and folds affine functions which are constant into the @@ -77,4 +77,4 @@ class QuasiPolynomial : public PresburgerSpace { } // namespace presburger } // namespace mlir -#endif // MLIR_ANALYSIS_PRESBURGER_QUASIPOLYNOMIAL_H \ No newline at end of file +#endif // MLIR_ANALYSIS_PRESBURGER_QUASIPOLYNOMIAL_H diff --git a/mlir/include/mlir/Bytecode/BytecodeOpInterface.h b/mlir/include/mlir/Bytecode/BytecodeOpInterface.h index c9c60608fa5b98..98f1fee7f9d3a9 100644 --- a/mlir/include/mlir/Bytecode/BytecodeOpInterface.h +++ b/mlir/include/mlir/Bytecode/BytecodeOpInterface.h @@ -15,8 +15,6 @@ #define MLIR_BYTECODE_BYTECODEOPINTERFACE_H #include "mlir/Bytecode/BytecodeImplementation.h" -#include "mlir/Bytecode/BytecodeReader.h" -#include "mlir/Bytecode/BytecodeWriter.h" #include "mlir/IR/OpDefinition.h" /// Include the generated interface declarations. diff --git a/mlir/include/mlir/Bytecode/BytecodeReaderConfig.h b/mlir/include/mlir/Bytecode/BytecodeReaderConfig.h index 47be732fa3880f..c473b5779130f6 100644 --- a/mlir/include/mlir/Bytecode/BytecodeReaderConfig.h +++ b/mlir/include/mlir/Bytecode/BytecodeReaderConfig.h @@ -1,4 +1,4 @@ -//===- BytecodeReader.h - MLIR Bytecode Reader ------------------*- C++ -*-===// +//===- BytecodeReaderConfig.h - MLIR Bytecode Reader Config -----*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This header defines interfaces to read MLIR bytecode files/streams. +// This header config for reading MLIR bytecode files/streams. // //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/CAPI/Rewrite.h b/mlir/include/mlir/CAPI/Rewrite.h new file mode 100644 index 00000000000000..0e6dcb24776263 --- /dev/null +++ b/mlir/include/mlir/CAPI/Rewrite.h @@ -0,0 +1,23 @@ +//===- Rewrite.h - C API Utils for Core MLIR classes ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains declarations of implementation details of the C API for +// rewrite patterns. This file should not be included from C++ code other than +// C API implementation nor from C code. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CAPI_REWRITE_H +#define MLIR_CAPI_REWRITE_H + +#include "mlir/CAPI/Wrap.h" +#include "mlir/IR/PatternMatch.h" + +DEFINE_C_API_PTR_METHODS(MlirRewriterBase, mlir::RewriterBase); + +#endif // MLIR_CAPIREWRITER_H diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h index e053e6c97e1430..c12ed7f5d0180b 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h @@ -18,6 +18,9 @@ class FuncOp; namespace bufferization { struct OneShotBufferizationOptions; +/// Maps from symbol table to its corresponding dealloc helper function. +using DeallocHelperMap = llvm::DenseMap; + //===----------------------------------------------------------------------===// // Passes //===----------------------------------------------------------------------===// @@ -46,7 +49,7 @@ std::unique_ptr createLowerDeallocationsPass(); /// Adds the conversion pattern of the `bufferization.dealloc` operation to the /// given pattern set for use in other transformation passes. void populateBufferizationDeallocLoweringPattern( - RewritePatternSet &patterns, func::FuncOp deallocLibraryFunc); + RewritePatternSet &patterns, const DeallocHelperMap &deallocHelperFuncMap); /// Construct the library function needed for the fully generic /// `bufferization.dealloc` lowering implemented in the LowerDeallocations pass. diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index 65dfcf93d70294..f0dec69a5032a0 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -1449,6 +1449,7 @@ def LLVM_LLVMFuncOp : LLVM_Op<"func", [ OptionalAttr:$vscale_range, OptionalAttr:$frame_pointer, OptionalAttr:$target_cpu, + OptionalAttr:$tune_cpu, OptionalAttr:$target_features, OptionalAttr:$unsafe_fp_math, OptionalAttr:$no_infs_fp_math, diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml index c31b7c4f6c1089..46b3ec0f60ebfa 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml @@ -516,7 +516,7 @@ structured_op: !LinalgStructuredOpConfig --- !LinalgOpConfig metadata: !LinalgOpMetadata name: erf - cpp_class_name: erfOp + cpp_class_name: ErfOp doc: |- Applies erf(x) elementwise. diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index aee2937ce5cb7b..69fd1f1f0130fd 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -133,8 +133,12 @@ def ParallelOp : OpenMP_Op<"parallel", traits = [ RecursiveMemoryEffects ], clauses = [ // TODO: Sort clauses alphabetically. - OpenMP_IfClause, OpenMP_NumThreadsClause, OpenMP_AllocateClause, - OpenMP_ReductionClause, OpenMP_ProcBindClause, OpenMP_PrivateClause + OpenMP_IfClauseSkip, + OpenMP_NumThreadsClauseSkip, + OpenMP_AllocateClauseSkip, + OpenMP_ReductionClauseSkip, + OpenMP_ProcBindClauseSkip, + OpenMP_PrivateClauseSkip ], singleRegion = true> { let summary = "parallel construct"; let description = [{ @@ -154,7 +158,8 @@ def ParallelOp : OpenMP_Op<"parallel", traits = [ // TODO: Use default assembly format inherited from OpenMP_Op once printing // and parsing of the parallel region is not intermingled with printing and - // parsing of reduction and private clauses. + // parsing of reduction and private clauses. `assemblyFormat` should also be + // no longer skipped for clauses added to this operation at that time. let assemblyFormat = [{ oilist( `if` `(` $if_expr `)` @@ -363,8 +368,12 @@ def WsloopOp : OpenMP_Op<"wsloop", traits = [ ], clauses = [ // TODO: Complete clause list (allocate, private). // TODO: Sort clauses alphabetically. - OpenMP_LinearClause, OpenMP_ReductionClause, OpenMP_ScheduleClause, - OpenMP_NowaitClause, OpenMP_OrderedClause, OpenMP_OrderClause + OpenMP_LinearClauseSkip, + OpenMP_ReductionClauseSkip, + OpenMP_ScheduleClauseSkip, + OpenMP_NowaitClauseSkip, + OpenMP_OrderedClauseSkip, + OpenMP_OrderClauseSkip ], singleRegion = true> { let summary = "worksharing-loop construct"; let description = [{ @@ -398,7 +407,8 @@ def WsloopOp : OpenMP_Op<"wsloop", traits = [ // TODO: Use default assembly format inherited from OpenMP_Op once printing // and parsing of the workshare loop region is not intermingled with printing - // and parsing of reduction clauses. + // and parsing of reduction clauses. `assemblyFormat` should also be no longer + // skipped for clauses added to this operation at that time. let assemblyFormat = [{ oilist(`linear` `(` custom($linear_vars, type($linear_vars), diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td index f35ea962bea168..acbcbae105dbfb 100644 --- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td +++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td @@ -1159,7 +1159,7 @@ def IndexSwitchOp : SCF_Op<"index_switch", [RecursiveMemoryEffects, Block &getCaseBlock(unsigned idx); }]; - let hasFolder = 1; + let hasCanonicalizer = 1; let hasVerifier = 1; } diff --git a/mlir/include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.td b/mlir/include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.td index 7bf914f6456ce1..20880d94a83cac 100644 --- a/mlir/include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.td +++ b/mlir/include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.td @@ -38,6 +38,17 @@ def ApplySCFStructuralConversionPatternsOp : Op]> { + let description = [{ + Collects patterns that lower structured control flow ops to unstructured + control flow. + }]; + + let assemblyFormat = "attr-dict"; +} + def Transform_ScfForOp : Transform_ConcreteOpType<"scf.for">; def ForallToForOp : Op>:$mask, - OptionalAttr:$in_bounds)>, + BoolArrayAttr:$in_bounds)>, Results<(outs AnyVectorOfAnyRank:$vector)> { let summary = "Reads a supervector from memory into an SSA vector value."; @@ -1401,16 +1401,19 @@ def Vector_TransferReadOp : permutation or broadcasting. Elements whose corresponding mask element is `0` are masked out and replaced with `padding`. - An optional boolean array attribute `in_bounds` specifies for every vector - dimension if the transfer is guaranteed to be within the source bounds. If - specified, the `in_bounds` array length has to be equal to the vector rank. - If set to "false", accesses (including the starting point) may run + For every vector dimension, the boolean array attribute `in_bounds` + specifies if the transfer is guaranteed to be within the source bounds. If + set to "false", accesses (including the starting point) may run out-of-bounds along the respective vector dimension as the index increases. - Broadcast dimensions must always be in-bounds. In absence of the attribute, - accesses along all vector dimensions (except for broadcasts) may run - out-of-bounds. A `vector.transfer_read` can be lowered to a simple load if - all dimensions are specified to be within bounds and no `mask` was - specified. Note that non-vector dimensions *must* always be in-bounds. + Non-vector and broadcast dimensions *must* always be in-bounds. The + `in_bounds` array length has to be equal to the vector rank. This attribute + has a default value: `false` (i.e. "out-of-bounds"). When skipped in the + textual IR, the default value is assumed. Similarly, the OP printer will + omit this attribute when all dimensions are out-of-bounds (i.e. the default + value is used). + + A `vector.transfer_read` can be lowered to a simple load if all dimensions + are specified to be within bounds and no `mask` was specified. This operation is called 'read' by opposition to 'load' because the super-vector granularity is generally not representable with a single @@ -1607,7 +1610,7 @@ def Vector_TransferWriteOp : Variadic:$indices, AffineMapAttr:$permutation_map, Optional>:$mask, - OptionalAttr:$in_bounds)>, + BoolArrayAttr:$in_bounds)>, Results<(outs Optional:$result)> { let summary = "The vector.transfer_write op writes a supervector to memory."; @@ -1643,15 +1646,19 @@ def Vector_TransferWriteOp : any permutation. Elements whose corresponding mask element is `0` are masked out. - An optional boolean array attribute `in_bounds` specifies for every vector - dimension if the transfer is guaranteed to be within the source bounds. If - specified, the `in_bounds` array length has to be equal to the vector rank. - If set to "false", accesses (including the starting point) may run + For every vector dimension, the boolean array attribute `in_bounds` + specifies if the transfer is guaranteed to be within the source bounds. If + set to "false", accesses (including the starting point) may run out-of-bounds along the respective vector dimension as the index increases. - In absence of the attribute, accesses along all vector dimensions may run - out-of-bounds. A `vector.transfer_write` can be lowered to a simple store if - all dimensions are specified to be within bounds and no `mask` was - specified. Note that non-vector dimensions *must* always be in-bounds. + Non-vector and broadcast dimensions *must* always be in-bounds. The + `in_bounds` array length has to be equal to the vector rank. This attribute + has a default value: `false` (i.e. "out-of-bounds"). When skipped in the + textual IR, the default value is assumed. Similarly, the OP printer will + omit this attribute when all dimensions are out-of-bounds (i.e. the default + value is used). + + A `vector.transfer_write` can be lowered to a simple store if all + dimensions are specified to be within bounds and no `mask` was specified. This operation is called 'write' by opposition to 'store' because the super-vector granularity is generally not representable with a single diff --git a/mlir/include/mlir/IR/AffineMap.h b/mlir/include/mlir/IR/AffineMap.h index 264c1c8308e789..676da6d1764970 100644 --- a/mlir/include/mlir/IR/AffineMap.h +++ b/mlir/include/mlir/IR/AffineMap.h @@ -146,6 +146,14 @@ class AffineMap { /// affine map (d0, ..., dn) -> (dp, ..., dn) on the most minor dimensions. bool isMinorIdentity() const; + /// Returns the list of broadcast dimensions (i.e. dims indicated by value 0 + /// in the result). + /// Ex: + /// * (d0, d1, d2) -> (0, d1) gives [0] + /// * (d0, d1, d2) -> (d2, d1) gives [] + /// * (d0, d1, d2, d4) -> (d0, 0, d1, 0) gives [1, 3] + SmallVector getBroadcastDims() const; + /// Returns true if this affine map is a minor identity up to broadcasted /// dimensions which are indicated by value 0 in the result. If /// `broadcastedDims` is not null, it will be populated with the indices of diff --git a/mlir/include/mlir/Interfaces/VectorInterfaces.td b/mlir/include/mlir/Interfaces/VectorInterfaces.td index 781d6d3e3f813a..7ea62c2ae2ab13 100644 --- a/mlir/include/mlir/Interfaces/VectorInterfaces.td +++ b/mlir/include/mlir/Interfaces/VectorInterfaces.td @@ -98,7 +98,7 @@ def VectorTransferOpInterface : OpInterface<"VectorTransferOpInterface"> { dimension whether it is in-bounds or not. (Broadcast dimensions are always in-bounds). }], - /*retTy=*/"::std::optional<::mlir::ArrayAttr>", + /*retTy=*/"::mlir::ArrayAttr", /*methodName=*/"getInBounds", /*args=*/(ins) >, @@ -240,9 +240,7 @@ def VectorTransferOpInterface : OpInterface<"VectorTransferOpInterface"> { bool isDimInBounds(unsigned dim) { if ($_op.isBroadcastDim(dim)) return true; - if (!$_op.getInBounds()) - return false; - auto inBounds = ::llvm::cast<::mlir::ArrayAttr>(*$_op.getInBounds()); + auto inBounds = $_op.getInBounds(); return ::llvm::cast<::mlir::BoolAttr>(inBounds[dim]).getValue(); } diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index a22f198bdf2520..a51b00271f0aeb 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -174,15 +174,15 @@ class TypeConverter { /// where `T` is any subclass of `Type`. This function is responsible for /// creating an operation, using the OpBuilder and Location provided, that /// "casts" a range of values into a single value of the given type `T`. It - /// must return a Value of the converted type on success, an `std::nullopt` if + /// must return a Value of the type `T` on success, an `std::nullopt` if /// it failed but other materialization can be attempted, and `nullptr` on - /// unrecoverable failure. It will only be called for (sub)types of `T`. - /// Materialization functions must be provided when a type conversion may - /// persist after the conversion has finished. + /// unrecoverable failure. Materialization functions must be provided when a + /// type conversion may persist after the conversion has finished. /// This method registers a materialization that will be called when /// converting (potentially multiple) block arguments that were the result of - /// a signature conversion of a single block argument, to a single SSA value. + /// a signature conversion of a single block argument, to a single SSA value + /// with the old block argument type. template >::template arg_t<1>> void addArgumentMaterialization(FnT &&callback) { diff --git a/mlir/lib/Analysis/Presburger/Barvinok.cpp b/mlir/lib/Analysis/Presburger/Barvinok.cpp index dae840e00ff2e2..fad4364391d569 100644 --- a/mlir/lib/Analysis/Presburger/Barvinok.cpp +++ b/mlir/lib/Analysis/Presburger/Barvinok.cpp @@ -361,7 +361,7 @@ mlir::presburger::detail::computePolytopeGeneratingFunction( continue; // If this subset corresponds to a vertex that has not been considered, // store it. - vertices.push_back(*vertex); + vertices.emplace_back(*vertex); // If a vertex is formed by the intersection of more than d facets, we // assume that any d-subset of these facets can be solved to obtain its @@ -472,10 +472,10 @@ mlir::presburger::detail::computePolytopeGeneratingFunction( Point mlir::presburger::detail::getNonOrthogonalVector( ArrayRef vectors) { unsigned dim = vectors[0].size(); - assert( - llvm::all_of(vectors, - [&](const Point &vector) { return vector.size() == dim; }) && - "all vectors need to be the same size!"); + assert(llvm::all_of( + vectors, + [&dim](const Point &vector) { return vector.size() == dim; }) && + "all vectors need to be the same size!"); SmallVector newPoint = {Fraction(1, 1)}; Fraction maxDisallowedValue = -Fraction(1, 0), @@ -493,7 +493,7 @@ Point mlir::presburger::detail::getNonOrthogonalVector( // Find the biggest such value maxDisallowedValue = std::max(maxDisallowedValue, disallowedValue); } - newPoint.push_back(maxDisallowedValue + 1); + newPoint.emplace_back(maxDisallowedValue + 1); } return newPoint; } @@ -519,19 +519,20 @@ QuasiPolynomial mlir::presburger::detail::getCoefficientInRationalFunction( unsigned numParam = num[0].getNumInputs(); // We use the `isEqual` method of PresburgerSpace, which QuasiPolynomial // inherits from. - assert( - llvm::all_of( - num, [&](const QuasiPolynomial &qp) { return num[0].isEqual(qp); }) && - "the quasipolynomials should all belong to the same space!"); + assert(llvm::all_of(num, + [&num](const QuasiPolynomial &qp) { + return num[0].isEqual(qp); + }) && + "the quasipolynomials should all belong to the same space!"); std::vector coefficients; coefficients.reserve(power + 1); - coefficients.push_back(num[0] / den[0]); + coefficients.emplace_back(num[0] / den[0]); for (unsigned i = 1; i <= power; ++i) { // If the power is not there in the numerator, the coefficient is zero. - coefficients.push_back(i < num.size() ? num[i] - : QuasiPolynomial(numParam, 0)); + coefficients.emplace_back(i < num.size() ? num[i] + : QuasiPolynomial(numParam, 0)); // After den.size(), the coefficients are zero, so we stop // subtracting at that point (if it is less than i). @@ -573,7 +574,7 @@ substituteMuInTerm(unsigned numParams, const ParamPoint &v, SmallVector coefficients; coefficients.reserve(numDims); for (const Point &d : ds) - coefficients.push_back(-dotProduct(mu, d)); + coefficients.emplace_back(-dotProduct(mu, d)); // Then, the affine function is a single floor expression, given by the // corresponding column of v. @@ -581,7 +582,7 @@ substituteMuInTerm(unsigned numParams, const ParamPoint &v, std::vector>> affine; affine.reserve(numDims); for (unsigned j = 0; j < numDims; ++j) - affine.push_back({SmallVector(vTranspose.getRow(j))}); + affine.push_back({SmallVector{vTranspose.getRow(j)}}); QuasiPolynomial num(numParams, coefficients, affine); num = num.simplify(); @@ -593,7 +594,7 @@ substituteMuInTerm(unsigned numParams, const ParamPoint &v, for (const Point &d : ds) { // This term in the denominator is // (1 - t^dens.back()) - dens.push_back(dotProduct(d, mu)); + dens.emplace_back(dotProduct(d, mu)); } return {num, dens}; @@ -641,7 +642,7 @@ std::vector getBinomialCoefficients(const QuasiPolynomial &n, coefficients.emplace_back(numParams, 1); for (unsigned j = 1; j <= r; ++j) // We use the recursive formula for binomial coefficients here and below. - coefficients.push_back( + coefficients.emplace_back( (coefficients[j - 1] * (n - QuasiPolynomial(numParams, j - 1)) / Fraction(j, 1)) .simplify()); @@ -656,7 +657,7 @@ std::vector getBinomialCoefficients(const Fraction &n, coefficients.reserve((int64_t)floor(r)); coefficients.emplace_back(1); for (unsigned j = 1; j <= r; ++j) - coefficients.push_back(coefficients[j - 1] * (n - (j - 1)) / (j)); + coefficients.emplace_back(coefficients[j - 1] * (n - (j - 1)) / (j)); return coefficients; } @@ -764,8 +765,8 @@ mlir::presburger::detail::computeNumTerms(const GeneratingFunction &gf) { eachTermDenCoefficients.reserve(r); for (const Fraction &den : dens) { singleTermDenCoefficients = getBinomialCoefficients(den + 1, den + 1); - eachTermDenCoefficients.push_back( - ArrayRef(singleTermDenCoefficients).slice(1)); + eachTermDenCoefficients.emplace_back( + ArrayRef(singleTermDenCoefficients).drop_front()); } // Now we find the coefficients in Q(s) itself diff --git a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp index 095a7dcb287f3c..bdcb55251b1041 100644 --- a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp +++ b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp @@ -511,10 +511,10 @@ void IntegerRelation::getLowerAndUpperBoundIndices( continue; if (atIneq(r, pos) >= 1) { // Lower bound. - lbIndices->push_back(r); + lbIndices->emplace_back(r); } else if (atIneq(r, pos) <= -1) { // Upper bound. - ubIndices->push_back(r); + ubIndices->emplace_back(r); } } @@ -528,7 +528,7 @@ void IntegerRelation::getLowerAndUpperBoundIndices( continue; if (containsConstraintDependentOnRange(r, /*isEq=*/true)) continue; - eqIndices->push_back(r); + eqIndices->emplace_back(r); } } @@ -791,7 +791,7 @@ IntMatrix IntegerRelation::getBoundedDirections() const { // processes all the inequalities. for (unsigned i = 0, e = getNumInequalities(); i < e; ++i) { if (simplex.isBoundedAlongConstraint(i)) - boundedIneqs.push_back(i); + boundedIneqs.emplace_back(i); } // The direction vector is given by the coefficients and does not include the @@ -1981,13 +1981,13 @@ void IntegerRelation::fourierMotzkinEliminate(unsigned pos, bool darkShadow, for (unsigned r = 0, e = getNumInequalities(); r < e; r++) { if (atIneq(r, pos) == 0) { // Var does not appear in bound. - nbIndices.push_back(r); + nbIndices.emplace_back(r); } else if (atIneq(r, pos) >= 1) { // Lower bound. - lbIndices.push_back(r); + lbIndices.emplace_back(r); } else { // Upper bound. - ubIndices.push_back(r); + ubIndices.emplace_back(r); } } @@ -2028,8 +2028,8 @@ void IntegerRelation::fourierMotzkinEliminate(unsigned pos, bool darkShadow, continue; assert(lbCoeff >= 1 && ubCoeff >= 1 && "bounds wrongly identified"); DynamicAPInt lcm = llvm::lcm(lbCoeff, ubCoeff); - ineq.push_back(atIneq(ubPos, l) * (lcm / ubCoeff) + - atIneq(lbPos, l) * (lcm / lbCoeff)); + ineq.emplace_back(atIneq(ubPos, l) * (lcm / ubCoeff) + + atIneq(lbPos, l) * (lcm / lbCoeff)); assert(lcm > 0 && "lcm should be positive!"); if (lcm != 1) allLCMsAreOne = false; @@ -2057,7 +2057,7 @@ void IntegerRelation::fourierMotzkinEliminate(unsigned pos, bool darkShadow, for (unsigned l = 0, e = getNumCols(); l < e; l++) { if (l == pos) continue; - ineq.push_back(atIneq(nbPos, l)); + ineq.emplace_back(atIneq(nbPos, l)); } newRel.addInequality(ineq); } @@ -2072,7 +2072,7 @@ void IntegerRelation::fourierMotzkinEliminate(unsigned pos, bool darkShadow, for (unsigned l = 0, e = getNumCols(); l < e; l++) { if (l == pos) continue; - eq.push_back(atEq(r, l)); + eq.emplace_back(atEq(r, l)); } newRel.addEquality(eq); } @@ -2264,8 +2264,8 @@ IntegerRelation::unionBoundingBox(const IntegerRelation &otherCst) { std::negate()); std::copy(maxUb.begin(), maxUb.end(), newUb.begin() + getNumDimVars()); - boundingLbs.push_back(newLb); - boundingUbs.push_back(newUb); + boundingLbs.emplace_back(newLb); + boundingUbs.emplace_back(newUb); } // Clear all constraints and add the lower/upper bounds for the bounding box. @@ -2309,7 +2309,7 @@ static void getIndependentConstraints(const IntegerRelation &cst, unsigned pos, break; } if (c == pos + num) - nbIneqIndices.push_back(r); + nbIneqIndices.emplace_back(r); } for (unsigned r = 0, e = cst.getNumEqualities(); r < e; r++) { @@ -2320,7 +2320,7 @@ static void getIndependentConstraints(const IntegerRelation &cst, unsigned pos, break; } if (c == pos + num) - nbEqIndices.push_back(r); + nbEqIndices.emplace_back(r); } } diff --git a/mlir/lib/Analysis/Presburger/LinearTransform.cpp b/mlir/lib/Analysis/Presburger/LinearTransform.cpp index cccbf4c9991d3c..1e389ca69e4e8e 100644 --- a/mlir/lib/Analysis/Presburger/LinearTransform.cpp +++ b/mlir/lib/Analysis/Presburger/LinearTransform.cpp @@ -51,7 +51,7 @@ IntegerRelation LinearTransform::applyTo(const IntegerRelation &rel) const { const DynamicAPInt &c = eq.back(); SmallVector newEq = preMultiplyWithRow(eq.drop_back()); - newEq.push_back(c); + newEq.emplace_back(c); result.addEquality(newEq); } @@ -61,7 +61,7 @@ IntegerRelation LinearTransform::applyTo(const IntegerRelation &rel) const { const DynamicAPInt &c = ineq.back(); SmallVector newIneq = preMultiplyWithRow(ineq.drop_back()); - newIneq.push_back(c); + newIneq.emplace_back(c); result.addInequality(newIneq); } diff --git a/mlir/lib/Analysis/Presburger/PWMAFunction.cpp b/mlir/lib/Analysis/Presburger/PWMAFunction.cpp index f78eb7d2d98ceb..beb9f3e82e22d3 100644 --- a/mlir/lib/Analysis/Presburger/PWMAFunction.cpp +++ b/mlir/lib/Analysis/Presburger/PWMAFunction.cpp @@ -46,7 +46,7 @@ static SmallVector subtractExprs(ArrayRef vecA, SmallVector result; result.reserve(vecA.size()); for (unsigned i = 0, e = vecA.size(); i < e; ++i) - result.push_back(vecA[i] - vecB[i]); + result.emplace_back(vecA[i] - vecB[i]); return result; } @@ -78,7 +78,7 @@ MultiAffineFunction::valueAt(ArrayRef point) const { // function of; we have computed one possible set of values and use them here. pointHomogenous.reserve(pointHomogenous.size() + divValues.size()); for (const std::optional &divVal : divValues) - pointHomogenous.push_back(*divVal); + pointHomogenous.emplace_back(*divVal); // The matrix `output` has an affine expression in the ith row, corresponding // to the expression for the ith value in the output vector. The last column // of the matrix contains the constant term. Let v be the input point with @@ -295,7 +295,7 @@ void PWMAFunction::addPiece(const Piece &piece) { assert(piece.isConsistent() && "Piece should be consistent"); assert(piece.domain.intersect(getDomain()).isIntegerEmpty() && "Piece should be disjoint from the function"); - pieces.push_back(piece); + pieces.emplace_back(piece); } void PWMAFunction::print(raw_ostream &os) const { diff --git a/mlir/lib/Analysis/Presburger/PresburgerRelation.cpp b/mlir/lib/Analysis/Presburger/PresburgerRelation.cpp index e284ca82420bac..239ffe6aaaa764 100644 --- a/mlir/lib/Analysis/Presburger/PresburgerRelation.cpp +++ b/mlir/lib/Analysis/Presburger/PresburgerRelation.cpp @@ -79,7 +79,7 @@ const IntegerRelation &PresburgerRelation::getDisjunct(unsigned index) const { /// IntegerRelation. void PresburgerRelation::unionInPlace(const IntegerRelation &disjunct) { assert(space.isCompatible(disjunct.getSpace()) && "Spaces should match"); - disjuncts.push_back(disjunct); + disjuncts.emplace_back(disjunct); } /// Mutate this set, turning it into the union of this set and the given set. @@ -121,8 +121,8 @@ PresburgerRelation::unionSet(const PresburgerRelation &set) const { /// A point is contained in the union iff any of the parts contain the point. bool PresburgerRelation::containsPoint(ArrayRef point) const { - return llvm::any_of(disjuncts, [&](const IntegerRelation &disjunct) { - return (disjunct.containsPointNoLocal(point)); + return llvm::any_of(disjuncts, [&point](const IntegerRelation &disjunct) { + return disjunct.containsPointNoLocal(point); }); } @@ -376,6 +376,15 @@ static PresburgerRelation getSetDifference(IntegerRelation b, // The index of the last inequality that was processed at this level. // This is empty when we are coming to this level for the first time. std::optional lastIneqProcessed; + + // Convenience constructor. + Frame(unsigned simplexSnapshot, + const IntegerRelation::CountsSnapshot &bCounts, + const IntegerRelation &sI, ArrayRef ineqsToProcess = {}, + std::optional lastIneqProcessed = std::nullopt) + : simplexSnapshot(simplexSnapshot), bCounts(bCounts), sI(sI), + ineqsToProcess(ineqsToProcess), lastIneqProcessed(lastIneqProcessed) { + } }; SmallVector frames; @@ -489,9 +498,7 @@ static PresburgerRelation getSetDifference(IntegerRelation b, // // TODO: consider supporting tail recursion directly if this becomes // relevant for performance. - frames.push_back(Frame{initialSnapshot, initBCounts, sI, - /*ineqsToProcess=*/{}, - /*lastIneqProcessed=*/{}}); + frames.emplace_back(Frame{initialSnapshot, initBCounts, sI}); ++level; continue; } @@ -521,7 +528,7 @@ static PresburgerRelation getSetDifference(IntegerRelation b, ineqsToProcess.reserve(totalNewSimplexInequalities); for (unsigned i = 0; i < totalNewSimplexInequalities; ++i) if (!canIgnoreIneq[i]) - ineqsToProcess.push_back(i); + ineqsToProcess.emplace_back(i); if (ineqsToProcess.empty()) { // Nothing to process; return. (we have no frame to pop.) @@ -531,8 +538,7 @@ static PresburgerRelation getSetDifference(IntegerRelation b, unsigned simplexSnapshot = simplex.getSnapshot(); IntegerRelation::CountsSnapshot bCounts = b.getCounts(); - frames.push_back(Frame{simplexSnapshot, bCounts, sI, ineqsToProcess, - /*lastIneqProcessed=*/std::nullopt}); + frames.emplace_back(Frame{simplexSnapshot, bCounts, sI, ineqsToProcess}); // We have completed the initial setup for this level. // Fallthrough to the main recursive part below. } @@ -796,7 +802,7 @@ SetCoalescer::SetCoalescer(const PresburgerRelation &s) : space(s.getSpace()) { continue; } ++i; - simplices.push_back(simp); + simplices.emplace_back(simp); } } @@ -928,9 +934,9 @@ LogicalResult SetCoalescer::typeInequality(ArrayRef ineq, Simplex &simp) { Simplex::IneqType type = simp.findIneqType(ineq); if (type == Simplex::IneqType::Redundant) - redundantIneqsB.push_back(ineq); + redundantIneqsB.emplace_back(ineq); else if (type == Simplex::IneqType::Cut) - cuttingIneqsB.push_back(ineq); + cuttingIneqsB.emplace_back(ineq); else return failure(); return success(); @@ -940,7 +946,7 @@ LogicalResult SetCoalescer::typeEquality(ArrayRef eq, Simplex &simp) { if (typeInequality(eq, simp).failed()) return failure(); - negEqs.push_back(getNegatedCoeffs(eq)); + negEqs.emplace_back(getNegatedCoeffs(eq)); ArrayRef inv(negEqs.back()); return typeInequality(inv, simp); } @@ -1038,7 +1044,7 @@ PresburgerRelation PresburgerRelation::simplify() const { } bool PresburgerRelation::isFullDim() const { - return llvm::any_of(getAllDisjuncts(), [&](IntegerRelation disjunct) { + return llvm::any_of(getAllDisjuncts(), [](IntegerRelation disjunct) { return disjunct.isFullDim(); }); } diff --git a/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp b/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp index 85cb56e8a11366..940a28f0ca0068 100644 --- a/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp +++ b/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp @@ -14,8 +14,8 @@ using namespace mlir; using namespace presburger; QuasiPolynomial::QuasiPolynomial( - unsigned numVars, SmallVector coeffs, - std::vector>> aff) + unsigned numVars, ArrayRef coeffs, + ArrayRef>> aff) : PresburgerSpace(/*numDomain=*/numVars, /*numRange=*/1, /*numSymbols=*/0, /*numLocals=*/0), coefficients(coeffs), affine(aff) { @@ -36,7 +36,7 @@ QuasiPolynomial::QuasiPolynomial( } /// Define a quasipolynomial which is a single constant. -QuasiPolynomial::QuasiPolynomial(unsigned numVars, Fraction constant) +QuasiPolynomial::QuasiPolynomial(unsigned numVars, const Fraction &constant) : PresburgerSpace(/*numDomain=*/numVars, /*numRange=*/1, /*numSymbols=*/0, /*numLocals=*/0), coefficients({constant}), affine({{}}) {} @@ -71,7 +71,7 @@ QuasiPolynomial QuasiPolynomial::operator*(const QuasiPolynomial &x) const { coeffs.reserve(coefficients.size() * x.coefficients.size()); for (const Fraction &coeff : coefficients) for (const Fraction &xcoeff : x.coefficients) - coeffs.push_back(coeff * xcoeff); + coeffs.emplace_back(coeff * xcoeff); std::vector> product; std::vector>> aff; @@ -81,14 +81,14 @@ QuasiPolynomial QuasiPolynomial::operator*(const QuasiPolynomial &x) const { product.clear(); product.insert(product.end(), term.begin(), term.end()); product.insert(product.end(), xterm.begin(), xterm.end()); - aff.push_back(product); + aff.emplace_back(product); } } return QuasiPolynomial(getNumInputs(), coeffs, aff); } -QuasiPolynomial QuasiPolynomial::operator/(const Fraction x) const { +QuasiPolynomial QuasiPolynomial::operator/(const Fraction &x) const { assert(x != 0 && "division by zero!"); QuasiPolynomial qp(*this); for (Fraction &coeff : qp.coefficients) @@ -130,15 +130,15 @@ QuasiPolynomial QuasiPolynomial::simplify() { newCoeff = coefficients[i]; for (ArrayRef term : affine[i]) { bool allCoeffsZero = llvm::all_of( - term.slice(0, numParam), [](const Fraction c) { return c == 0; }); + term.slice(0, numParam), [](const Fraction &c) { return c == 0; }); if (allCoeffsZero) newCoeff *= term[numParam]; else - newAffineTerm.push_back(SmallVector(term)); + newAffineTerm.emplace_back(term); } - newCoeffs.push_back(newCoeff); - newAffine.push_back(newAffineTerm); + newCoeffs.emplace_back(newCoeff); + newAffine.emplace_back(newAffineTerm); } return QuasiPolynomial(getNumInputs(), newCoeffs, newAffine); } @@ -157,8 +157,8 @@ QuasiPolynomial QuasiPolynomial::collectTerms() { } if (alreadyPresent) continue; - newCoeffs.push_back(coefficients[i]); - newAffine.push_back(affine[i]); + newCoeffs.emplace_back(coefficients[i]); + newAffine.emplace_back(affine[i]); } return QuasiPolynomial(getNumInputs(), newCoeffs, newAffine); @@ -167,7 +167,7 @@ QuasiPolynomial QuasiPolynomial::collectTerms() { Fraction QuasiPolynomial::getConstantTerm() { Fraction constTerm = 0; for (unsigned i = 0, e = coefficients.size(); i < e; ++i) - if (affine[i].size() == 0) + if (affine[i].empty()) constTerm += coefficients[i]; return constTerm; } diff --git a/mlir/lib/Analysis/Presburger/Simplex.cpp b/mlir/lib/Analysis/Presburger/Simplex.cpp index bebbf0325f430c..7c8a019557132a 100644 --- a/mlir/lib/Analysis/Presburger/Simplex.cpp +++ b/mlir/lib/Analysis/Presburger/Simplex.cpp @@ -12,6 +12,7 @@ #include "mlir/Analysis/Presburger/Matrix.h" #include "mlir/Analysis/Presburger/PresburgerSpace.h" #include "mlir/Analysis/Presburger/Utils.h" +#include "llvm/ADT/DynamicAPInt.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" @@ -42,18 +43,20 @@ scaleAndAddForAssert(ArrayRef a, const DynamicAPInt &scale, SmallVector res; res.reserve(a.size()); for (unsigned i = 0, e = a.size(); i < e; ++i) - res.push_back(a[i] + scale * b[i]); + res.emplace_back(a[i] + scale * b[i]); return res; } SimplexBase::SimplexBase(unsigned nVar, bool mustUseBigM) : usingBigM(mustUseBigM), nRedundant(0), nSymbol(0), tableau(0, getNumFixedCols() + nVar), empty(false) { + var.reserve(nVar); + colUnknown.reserve(nVar + 1); colUnknown.insert(colUnknown.begin(), getNumFixedCols(), nullIndex); for (unsigned i = 0; i < nVar; ++i) { var.emplace_back(Orientation::Column, /*restricted=*/false, /*pos=*/getNumFixedCols() + i); - colUnknown.push_back(i); + colUnknown.emplace_back(i); } } @@ -105,9 +108,9 @@ unsigned SimplexBase::addZeroRow(bool makeRestricted) { // Resize the tableau to accommodate the extra row. unsigned newRow = tableau.appendExtraRow(); assert(getNumRows() == getNumRows() && "Inconsistent tableau size"); - rowUnknown.push_back(~con.size()); + rowUnknown.emplace_back(~con.size()); con.emplace_back(Orientation::Row, makeRestricted, newRow); - undoLog.push_back(UndoLogEntry::RemoveLastConstraint); + undoLog.emplace_back(UndoLogEntry::RemoveLastConstraint); tableau(newRow, 0) = 1; return newRow; } @@ -346,8 +349,8 @@ SymbolicLexSimplex::getSymbolicSampleNumerator(unsigned row) const { SmallVector sample; sample.reserve(nSymbol + 1); for (unsigned col = 3; col < 3 + nSymbol; ++col) - sample.push_back(tableau(row, col)); - sample.push_back(tableau(row, 1)); + sample.emplace_back(tableau(row, col)); + sample.emplace_back(tableau(row, 1)); return sample; } @@ -426,8 +429,8 @@ LogicalResult SymbolicLexSimplex::addSymbolicCut(unsigned row) { divCoeffs.reserve(nSymbol + 1); DynamicAPInt divDenom = d; for (unsigned col = 3; col < 3 + nSymbol; ++col) - divCoeffs.push_back(mod(-tableau(row, col), divDenom)); // (-a_i%d)s_i - divCoeffs.push_back(mod(-tableau(row, 1), divDenom)); // -c%d. + divCoeffs.emplace_back(mod(-tableau(row, col), divDenom)); // (-a_i%d)s_i + divCoeffs.emplace_back(mod(-tableau(row, 1), divDenom)); // -c%d. normalizeDiv(divCoeffs, divDenom); domainSimplex.addDivisionVariable(divCoeffs, divDenom); @@ -619,8 +622,8 @@ SymbolicLexOpt SymbolicLexSimplex::computeSymbolicIntegerLexMin() { // reallocated. int splitIndex = rowUnknown[splitRow]; unsigned snapshot = getSnapshot(); - stack.push_back( - {splitIndex, snapshot, domainSnapshot, domainPolyCounts}); + stack.emplace_back( + StackFrame{splitIndex, snapshot, domainSnapshot, domainPolyCounts}); ++level; continue; } @@ -1093,7 +1096,7 @@ void SimplexBase::markEmpty() { // non-empty when rolling back past this point. if (empty) return; - undoLog.push_back(UndoLogEntry::UnmarkEmpty); + undoLog.emplace_back(UndoLogEntry::UnmarkEmpty); empty = true; } @@ -1120,6 +1123,7 @@ void Simplex::addInequality(ArrayRef coeffs) { void SimplexBase::addEquality(ArrayRef coeffs) { addInequality(coeffs); SmallVector negatedCoeffs; + negatedCoeffs.reserve(coeffs.size()); for (const DynamicAPInt &coeff : coeffs) negatedCoeffs.emplace_back(-coeff); addInequality(negatedCoeffs); @@ -1134,11 +1138,12 @@ unsigned SimplexBase::getSnapshot() const { return undoLog.size(); } unsigned SimplexBase::getSnapshotBasis() { SmallVector basis; + basis.reserve(colUnknown.size()); for (int index : colUnknown) { if (index != nullIndex) - basis.push_back(index); + basis.emplace_back(index); } - savedBases.push_back(std::move(basis)); + savedBases.emplace_back(std::move(basis)); undoLog.emplace_back(UndoLogEntry::RestoreBasis); return undoLog.size() - 1; @@ -1304,7 +1309,7 @@ void SimplexBase::addDivisionVariable(ArrayRef coeffs, SmallVector ineq(coeffs.begin(), coeffs.end()); DynamicAPInt constTerm = ineq.back(); ineq.back() = -denom; - ineq.push_back(constTerm); + ineq.emplace_back(constTerm); addInequality(ineq); for (DynamicAPInt &coeff : ineq) @@ -1321,7 +1326,7 @@ void SimplexBase::appendVariable(unsigned count) { for (unsigned i = 0; i < count; ++i) { var.emplace_back(Orientation::Column, /*restricted=*/false, /*pos=*/getNumColumns() + i); - colUnknown.push_back(var.size() - 1); + colUnknown.emplace_back(var.size() - 1); } tableau.resizeHorizontally(getNumColumns() + count); undoLog.insert(undoLog.end(), count, UndoLogEntry::RemoveLastVariable); @@ -1516,12 +1521,12 @@ Simplex Simplex::makeProduct(const Simplex &a, const Simplex &b) { result.colUnknown.assign(2, nullIndex); for (unsigned i = 2, e = a.getNumColumns(); i < e; ++i) { - result.colUnknown.push_back(a.colUnknown[i]); + result.colUnknown.emplace_back(a.colUnknown[i]); result.unknownFromIndex(result.colUnknown.back()).pos = result.colUnknown.size() - 1; } for (unsigned i = 2, e = b.getNumColumns(); i < e; ++i) { - result.colUnknown.push_back(indexFromBIndex(b.colUnknown[i])); + result.colUnknown.emplace_back(indexFromBIndex(b.colUnknown[i])); result.unknownFromIndex(result.colUnknown.back()).pos = result.colUnknown.size() - 1; } @@ -1530,7 +1535,7 @@ Simplex Simplex::makeProduct(const Simplex &a, const Simplex &b) { unsigned resultRow = result.tableau.appendExtraRow(); for (unsigned col = 0, e = a.getNumColumns(); col < e; ++col) result.tableau(resultRow, col) = a.tableau(row, col); - result.rowUnknown.push_back(a.rowUnknown[row]); + result.rowUnknown.emplace_back(a.rowUnknown[row]); result.unknownFromIndex(result.rowUnknown.back()).pos = result.rowUnknown.size() - 1; }; @@ -1545,7 +1550,7 @@ Simplex Simplex::makeProduct(const Simplex &a, const Simplex &b) { unsigned offset = a.getNumColumns() - 2; for (unsigned col = 2, e = b.getNumColumns(); col < e; ++col) result.tableau(resultRow, offset + col) = b.tableau(row, col); - result.rowUnknown.push_back(indexFromBIndex(b.rowUnknown[row])); + result.rowUnknown.emplace_back(indexFromBIndex(b.rowUnknown[row])); result.unknownFromIndex(result.rowUnknown.back()).pos = result.rowUnknown.size() - 1; }; @@ -1632,7 +1637,7 @@ Simplex::getSamplePointIfIntegral() const { // If the sample is non-integral, return std::nullopt. if (coord.num % coord.den != 0) return {}; - integerSample.push_back(coord.num / coord.den); + integerSample.emplace_back(coord.num / coord.den); } return integerSample; } @@ -1661,7 +1666,7 @@ class presburger::GBRSimplex { void addEqualityForDirection(ArrayRef dir) { assert(llvm::any_of(dir, [](const DynamicAPInt &x) { return x != 0; }) && "Direction passed is the zero vector!"); - snapshotStack.push_back(simplex.getSnapshot()); + snapshotStack.emplace_back(simplex.getSnapshot()); simplex.addEquality(getCoeffsForDirection(dir)); } /// Compute max(dotProduct(dir, x - y)). @@ -1691,6 +1696,7 @@ class presburger::GBRSimplex { assert(maybeWidth.isBounded() && "Width should be bounded!"); dualDenom = simplex.tableau(row, 0); dual.clear(); + dual.reserve((conIndex - simplexConstraintOffset) / 2); // The increment is i += 2 because equalities are added as two inequalities, // one positive and one negative. Each iteration processes one equality. @@ -1715,14 +1721,14 @@ class presburger::GBRSimplex { // Note that it is NOT valid to perform pivots during the computation of // the duals. This entire dual computation must be performed on the same // tableau configuration. - assert(!(simplex.con[i].orientation == Orientation::Column && - simplex.con[i + 1].orientation == Orientation::Column) && + assert((simplex.con[i].orientation != Orientation::Column || + simplex.con[i + 1].orientation != Orientation::Column) && "Both inequalities for the equality cannot be in column " "orientation!"); if (simplex.con[i].orientation == Orientation::Column) - dual.push_back(-simplex.tableau(row, simplex.con[i].pos)); + dual.emplace_back(-simplex.tableau(row, simplex.con[i].pos)); else if (simplex.con[i + 1].orientation == Orientation::Column) - dual.push_back(simplex.tableau(row, simplex.con[i + 1].pos)); + dual.emplace_back(simplex.tableau(row, simplex.con[i + 1].pos)); else dual.emplace_back(0); } @@ -1749,9 +1755,9 @@ class presburger::GBRSimplex { assert(2 * dir.size() == simplex.getNumVariables() && "Direction vector has wrong dimensionality"); SmallVector coeffs(dir.begin(), dir.end()); - coeffs.reserve(2 * dir.size()); + coeffs.reserve(dir.size() + 1); for (const DynamicAPInt &coeff : dir) - coeffs.push_back(-coeff); + coeffs.emplace_back(-coeff); coeffs.emplace_back(0); // constant term return coeffs; } @@ -1921,7 +1927,7 @@ void Simplex::reduceBasis(IntMatrix &basis, unsigned level) { // because this case should only occur when i is level, and there are no // duals in that case anyway. assert(i == level && "This case should only occur when i == level"); - width.push_back( + width.emplace_back( gbrSimplex.computeWidthAndDuals(basis.getRow(i), dual, dualDenom)); } @@ -1930,8 +1936,8 @@ void Simplex::reduceBasis(IntMatrix &basis, unsigned level) { "We don't know dual_i but we know width_{i+1}"); // We don't know dual for our level, so let's find it. gbrSimplex.addEqualityForDirection(basis.getRow(i)); - width.push_back(gbrSimplex.computeWidthAndDuals(basis.getRow(i + 1), dual, - dualDenom)); + width.emplace_back(gbrSimplex.computeWidthAndDuals(basis.getRow(i + 1), + dual, dualDenom)); gbrSimplex.removeLastEquality(); } @@ -2056,12 +2062,12 @@ std::optional> Simplex::findIntegerSample() { computeIntegerBounds(basisCoeffs); } - snapshotStack.push_back(getSnapshot()); + snapshotStack.emplace_back(getSnapshot()); // The smallest value in the range is the next value to try. // The values in the optionals are guaranteed to exist since we know the // polytope is bounded. - nextValueStack.push_back(*minRoundedUp); - upperBoundStack.push_back(*maxRoundedDown); + nextValueStack.emplace_back(*minRoundedUp); + upperBoundStack.emplace_back(*maxRoundedDown); } assert((snapshotStack.size() - 1 == level && @@ -2088,7 +2094,7 @@ std::optional> Simplex::findIntegerSample() { // Try the next value in the range and "recurse" into the next level. SmallVector basisCoeffs(basis.getRow(level).begin(), basis.getRow(level).end()); - basisCoeffs.push_back(-nextValue); + basisCoeffs.emplace_back(-nextValue); addEquality(basisCoeffs); level++; } diff --git a/mlir/lib/Analysis/Presburger/Utils.cpp b/mlir/lib/Analysis/Presburger/Utils.cpp index 383888c3b5660e..e74aae70796802 100644 --- a/mlir/lib/Analysis/Presburger/Utils.cpp +++ b/mlir/lib/Analysis/Presburger/Utils.cpp @@ -33,7 +33,7 @@ using llvm::dynamicAPIntFromInt64; static void normalizeDivisionByGCD(MutableArrayRef dividend, DynamicAPInt &divisor) { assert(divisor > 0 && "divisor must be non-negative!"); - if (divisor == 0 || dividend.empty()) + if (dividend.empty()) return; // We take the absolute value of dividend's coefficients to make sure that // `gcd` is positive. @@ -556,8 +556,7 @@ std::vector presburger::multiplyPolynomials(ArrayRef a, auto getCoeff = [](ArrayRef arr, unsigned i) -> Fraction { if (i < arr.size()) return arr[i]; - else - return 0; + return 0; }; std::vector convolution; @@ -566,11 +565,11 @@ std::vector presburger::multiplyPolynomials(ArrayRef a, Fraction sum(0, 1); for (unsigned l = 0; l <= k; ++l) sum += getCoeff(a, l) * getCoeff(b, k - l); - convolution.push_back(sum); + convolution.emplace_back(sum); } return convolution; } bool presburger::isRangeZero(ArrayRef arr) { - return llvm::all_of(arr, [&](Fraction f) { return f == 0; }); + return llvm::all_of(arr, [](const Fraction &f) { return f == 0; }); } diff --git a/mlir/lib/Bytecode/Writer/IRNumbering.cpp b/mlir/lib/Bytecode/Writer/IRNumbering.cpp index d2144dd7f33483..1bc02e17215732 100644 --- a/mlir/lib/Bytecode/Writer/IRNumbering.cpp +++ b/mlir/lib/Bytecode/Writer/IRNumbering.cpp @@ -9,6 +9,7 @@ #include "IRNumbering.h" #include "mlir/Bytecode/BytecodeImplementation.h" #include "mlir/Bytecode/BytecodeOpInterface.h" +#include "mlir/Bytecode/BytecodeWriter.h" #include "mlir/Bytecode/Encoding.h" #include "mlir/IR/AsmState.h" #include "mlir/IR/BuiltinTypes.h" diff --git a/mlir/lib/CAPI/Transforms/Rewrite.cpp b/mlir/lib/CAPI/Transforms/Rewrite.cpp index 0de1958398f63e..379f09cf5cc26a 100644 --- a/mlir/lib/CAPI/Transforms/Rewrite.cpp +++ b/mlir/lib/CAPI/Transforms/Rewrite.cpp @@ -7,15 +7,254 @@ //===----------------------------------------------------------------------===// #include "mlir-c/Rewrite.h" + #include "mlir-c/Transforms.h" #include "mlir/CAPI/IR.h" +#include "mlir/CAPI/Rewrite.h" #include "mlir/CAPI/Support.h" +#include "mlir/CAPI/Wrap.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Rewrite/FrozenRewritePatternSet.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" using namespace mlir; +//===----------------------------------------------------------------------===// +/// RewriterBase API inherited from OpBuilder +//===----------------------------------------------------------------------===// + +MlirContext mlirRewriterBaseGetContext(MlirRewriterBase rewriter) { + return wrap(unwrap(rewriter)->getContext()); +} + +//===----------------------------------------------------------------------===// +/// Insertion points methods + +void mlirRewriterBaseClearInsertionPoint(MlirRewriterBase rewriter) { + unwrap(rewriter)->clearInsertionPoint(); +} + +void mlirRewriterBaseSetInsertionPointBefore(MlirRewriterBase rewriter, + MlirOperation op) { + unwrap(rewriter)->setInsertionPoint(unwrap(op)); +} + +void mlirRewriterBaseSetInsertionPointAfter(MlirRewriterBase rewriter, + MlirOperation op) { + unwrap(rewriter)->setInsertionPointAfter(unwrap(op)); +} + +void mlirRewriterBaseSetInsertionPointAfterValue(MlirRewriterBase rewriter, + MlirValue value) { + unwrap(rewriter)->setInsertionPointAfterValue(unwrap(value)); +} + +void mlirRewriterBaseSetInsertionPointToStart(MlirRewriterBase rewriter, + MlirBlock block) { + unwrap(rewriter)->setInsertionPointToStart(unwrap(block)); +} + +void mlirRewriterBaseSetInsertionPointToEnd(MlirRewriterBase rewriter, + MlirBlock block) { + unwrap(rewriter)->setInsertionPointToEnd(unwrap(block)); +} + +MlirBlock mlirRewriterBaseGetInsertionBlock(MlirRewriterBase rewriter) { + return wrap(unwrap(rewriter)->getInsertionBlock()); +} + +MlirBlock mlirRewriterBaseGetBlock(MlirRewriterBase rewriter) { + return wrap(unwrap(rewriter)->getBlock()); +} + +//===----------------------------------------------------------------------===// +/// Block and operation creation/insertion/cloning + +MlirBlock mlirRewriterBaseCreateBlockBefore(MlirRewriterBase rewriter, + MlirBlock insertBefore, + intptr_t nArgTypes, + MlirType const *argTypes, + MlirLocation const *locations) { + SmallVector args; + ArrayRef unwrappedArgs = unwrapList(nArgTypes, argTypes, args); + SmallVector locs; + ArrayRef unwrappedLocs = unwrapList(nArgTypes, locations, locs); + return wrap(unwrap(rewriter)->createBlock(unwrap(insertBefore), unwrappedArgs, + unwrappedLocs)); +} + +MlirOperation mlirRewriterBaseInsert(MlirRewriterBase rewriter, + MlirOperation op) { + return wrap(unwrap(rewriter)->insert(unwrap(op))); +} + +// Other methods of OpBuilder + +MlirOperation mlirRewriterBaseClone(MlirRewriterBase rewriter, + MlirOperation op) { + return wrap(unwrap(rewriter)->clone(*unwrap(op))); +} + +MlirOperation mlirRewriterBaseCloneWithoutRegions(MlirRewriterBase rewriter, + MlirOperation op) { + return wrap(unwrap(rewriter)->cloneWithoutRegions(*unwrap(op))); +} + +void mlirRewriterBaseCloneRegionBefore(MlirRewriterBase rewriter, + MlirRegion region, MlirBlock before) { + + unwrap(rewriter)->cloneRegionBefore(*unwrap(region), unwrap(before)); +} + +//===----------------------------------------------------------------------===// +/// RewriterBase API +//===----------------------------------------------------------------------===// + +void mlirRewriterBaseInlineRegionBefore(MlirRewriterBase rewriter, + MlirRegion region, MlirBlock before) { + unwrap(rewriter)->inlineRegionBefore(*unwrap(region), unwrap(before)); +} + +void mlirRewriterBaseReplaceOpWithValues(MlirRewriterBase rewriter, + MlirOperation op, intptr_t nValues, + MlirValue const *values) { + SmallVector vals; + ArrayRef unwrappedVals = unwrapList(nValues, values, vals); + unwrap(rewriter)->replaceOp(unwrap(op), unwrappedVals); +} + +void mlirRewriterBaseReplaceOpWithOperation(MlirRewriterBase rewriter, + MlirOperation op, + MlirOperation newOp) { + unwrap(rewriter)->replaceOp(unwrap(op), unwrap(newOp)); +} + +void mlirRewriterBaseEraseOp(MlirRewriterBase rewriter, MlirOperation op) { + unwrap(rewriter)->eraseOp(unwrap(op)); +} + +void mlirRewriterBaseEraseBlock(MlirRewriterBase rewriter, MlirBlock block) { + unwrap(rewriter)->eraseBlock(unwrap(block)); +} + +void mlirRewriterBaseInlineBlockBefore(MlirRewriterBase rewriter, + MlirBlock source, MlirOperation op, + intptr_t nArgValues, + MlirValue const *argValues) { + SmallVector vals; + ArrayRef unwrappedVals = unwrapList(nArgValues, argValues, vals); + + unwrap(rewriter)->inlineBlockBefore(unwrap(source), unwrap(op), + unwrappedVals); +} + +void mlirRewriterBaseMergeBlocks(MlirRewriterBase rewriter, MlirBlock source, + MlirBlock dest, intptr_t nArgValues, + MlirValue const *argValues) { + SmallVector args; + ArrayRef unwrappedArgs = unwrapList(nArgValues, argValues, args); + unwrap(rewriter)->mergeBlocks(unwrap(source), unwrap(dest), unwrappedArgs); +} + +void mlirRewriterBaseMoveOpBefore(MlirRewriterBase rewriter, MlirOperation op, + MlirOperation existingOp) { + unwrap(rewriter)->moveOpBefore(unwrap(op), unwrap(existingOp)); +} + +void mlirRewriterBaseMoveOpAfter(MlirRewriterBase rewriter, MlirOperation op, + MlirOperation existingOp) { + unwrap(rewriter)->moveOpAfter(unwrap(op), unwrap(existingOp)); +} + +void mlirRewriterBaseMoveBlockBefore(MlirRewriterBase rewriter, MlirBlock block, + MlirBlock existingBlock) { + unwrap(rewriter)->moveBlockBefore(unwrap(block), unwrap(existingBlock)); +} + +void mlirRewriterBaseStartOpModification(MlirRewriterBase rewriter, + MlirOperation op) { + unwrap(rewriter)->startOpModification(unwrap(op)); +} + +void mlirRewriterBaseFinalizeOpModification(MlirRewriterBase rewriter, + MlirOperation op) { + unwrap(rewriter)->finalizeOpModification(unwrap(op)); +} + +void mlirRewriterBaseCancelOpModification(MlirRewriterBase rewriter, + MlirOperation op) { + unwrap(rewriter)->cancelOpModification(unwrap(op)); +} + +void mlirRewriterBaseReplaceAllUsesWith(MlirRewriterBase rewriter, + MlirValue from, MlirValue to) { + unwrap(rewriter)->replaceAllUsesWith(unwrap(from), unwrap(to)); +} + +void mlirRewriterBaseReplaceAllValueRangeUsesWith(MlirRewriterBase rewriter, + intptr_t nValues, + MlirValue const *from, + MlirValue const *to) { + SmallVector fromVals; + ArrayRef unwrappedFromVals = unwrapList(nValues, from, fromVals); + SmallVector toVals; + ArrayRef unwrappedToVals = unwrapList(nValues, to, toVals); + unwrap(rewriter)->replaceAllUsesWith(unwrappedFromVals, unwrappedToVals); +} + +void mlirRewriterBaseReplaceAllOpUsesWithValueRange(MlirRewriterBase rewriter, + MlirOperation from, + intptr_t nTo, + MlirValue const *to) { + SmallVector toVals; + ArrayRef unwrappedToVals = unwrapList(nTo, to, toVals); + unwrap(rewriter)->replaceAllOpUsesWith(unwrap(from), unwrappedToVals); +} + +void mlirRewriterBaseReplaceAllOpUsesWithOperation(MlirRewriterBase rewriter, + MlirOperation from, + MlirOperation to) { + unwrap(rewriter)->replaceAllOpUsesWith(unwrap(from), unwrap(to)); +} + +void mlirRewriterBaseReplaceOpUsesWithinBlock(MlirRewriterBase rewriter, + MlirOperation op, + intptr_t nNewValues, + MlirValue const *newValues, + MlirBlock block) { + SmallVector vals; + ArrayRef unwrappedVals = unwrapList(nNewValues, newValues, vals); + unwrap(rewriter)->replaceOpUsesWithinBlock(unwrap(op), unwrappedVals, + unwrap(block)); +} + +void mlirRewriterBaseReplaceAllUsesExcept(MlirRewriterBase rewriter, + MlirValue from, MlirValue to, + MlirOperation exceptedUser) { + unwrap(rewriter)->replaceAllUsesExcept(unwrap(from), unwrap(to), + unwrap(exceptedUser)); +} + +//===----------------------------------------------------------------------===// +/// IRRewriter API +//===----------------------------------------------------------------------===// + +MlirRewriterBase mlirIRRewriterCreate(MlirContext context) { + return wrap(new IRRewriter(unwrap(context))); +} + +MlirRewriterBase mlirIRRewriterCreateFromOp(MlirOperation op) { + return wrap(new IRRewriter(unwrap(op))); +} + +void mlirIRRewriterDestroy(MlirRewriterBase rewriter) { + delete static_cast(unwrap(rewriter)); +} + +//===----------------------------------------------------------------------===// +/// RewritePatternSet and FrozenRewritePatternSet API +//===----------------------------------------------------------------------===// + inline mlir::RewritePatternSet &unwrap(MlirRewritePatternSet module) { assert(module.ptr && "unexpected null module"); return *(static_cast(module.ptr)); @@ -54,6 +293,10 @@ mlirApplyPatternsAndFoldGreedily(MlirModule op, mlir::applyPatternsAndFoldGreedily(unwrap(op), *unwrap(patterns))); } +//===----------------------------------------------------------------------===// +/// PDLPatternModule API +//===----------------------------------------------------------------------===// + #if MLIR_ENABLE_PDL_IN_PATTERNMATCH inline mlir::PDLPatternModule *unwrap(MlirPDLPatternModule module) { assert(module.ptr && "unexpected null module"); diff --git a/mlir/lib/Conversion/BufferizationToMemRef/BufferizationToMemRef.cpp b/mlir/lib/Conversion/BufferizationToMemRef/BufferizationToMemRef.cpp index 2aae39f51b9409..f9903071be0842 100644 --- a/mlir/lib/Conversion/BufferizationToMemRef/BufferizationToMemRef.cpp +++ b/mlir/lib/Conversion/BufferizationToMemRef/BufferizationToMemRef.cpp @@ -132,27 +132,30 @@ struct BufferizationToMemRefPass return; } - func::FuncOp helperFuncOp; + bufferization::DeallocHelperMap deallocHelperFuncMap; if (auto module = dyn_cast(getOperation())) { OpBuilder builder = OpBuilder::atBlockBegin(&module.getBodyRegion().front()); - SymbolTable symbolTable(module); // Build dealloc helper function if there are deallocs. getOperation()->walk([&](bufferization::DeallocOp deallocOp) { - if (deallocOp.getMemrefs().size() > 1) { - helperFuncOp = bufferization::buildDeallocationLibraryFunction( - builder, getOperation()->getLoc(), symbolTable); - return WalkResult::interrupt(); + Operation *symtableOp = + deallocOp->getParentWithTrait(); + if (deallocOp.getMemrefs().size() > 1 && + !deallocHelperFuncMap.contains(symtableOp)) { + SymbolTable symbolTable(symtableOp); + func::FuncOp helperFuncOp = + bufferization::buildDeallocationLibraryFunction( + builder, getOperation()->getLoc(), symbolTable); + deallocHelperFuncMap[symtableOp] = helperFuncOp; } - return WalkResult::advance(); }); } RewritePatternSet patterns(&getContext()); patterns.add(patterns.getContext()); - bufferization::populateBufferizationDeallocLoweringPattern(patterns, - helperFuncOp); + bufferization::populateBufferizationDeallocLoweringPattern( + patterns, deallocHelperFuncMap); ConversionTarget target(getContext()); target.addLegalDialect std::optional { @@ -164,12 +166,18 @@ LLVMTypeConverter::LLVMTypeConverter(MLIRContext *ctx, // memref descriptor cannot be built just from a bare pointer. return std::nullopt; } - return UnrankedMemRefDescriptor::pack(builder, loc, *this, resultType, - inputs); + Value desc = UnrankedMemRefDescriptor::pack(builder, loc, *this, + resultType, inputs); + // An argument materialization must return a value of type + // `resultType`, so insert a cast from the memref descriptor type + // (!llvm.struct) to the original memref type. + return builder.create(loc, resultType, desc) + .getResult(0); }); addArgumentMaterialization([&](OpBuilder &builder, MemRefType resultType, ValueRange inputs, Location loc) -> std::optional { + Value desc; if (inputs.size() == 1) { // This is a bare pointer. We allow bare pointers only for function entry // blocks. @@ -180,10 +188,16 @@ LLVMTypeConverter::LLVMTypeConverter(MLIRContext *ctx, if (!block->isEntryBlock() || !isa(block->getParentOp())) return std::nullopt; - return MemRefDescriptor::fromStaticShape(builder, loc, *this, resultType, + desc = MemRefDescriptor::fromStaticShape(builder, loc, *this, resultType, inputs[0]); + } else { + desc = MemRefDescriptor::pack(builder, loc, *this, resultType, inputs); } - return MemRefDescriptor::pack(builder, loc, *this, resultType, inputs); + // An argument materialization must return a value of type `resultType`, + // so insert a cast from the memref descriptor type (!llvm.struct) to the + // original memref type. + return builder.create(loc, resultType, desc) + .getResult(0); }); // Add generic source and target materializations to handle cases where // non-LLVM types persist after an LLVM conversion. diff --git a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp index c9363295ec32f5..a4390447532a50 100644 --- a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp +++ b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp @@ -521,7 +521,7 @@ struct VectorShuffleOpConvert final LogicalResult matchAndRewrite(vector::ShuffleOp shuffleOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto oldResultType = shuffleOp.getResultVectorType(); + VectorType oldResultType = shuffleOp.getResultVectorType(); Type newResultType = getTypeConverter()->convertType(oldResultType); if (!newResultType) return rewriter.notifyMatchFailure(shuffleOp, @@ -532,20 +532,22 @@ struct VectorShuffleOpConvert final return cast(attr).getValue().getZExtValue(); }); - auto oldV1Type = shuffleOp.getV1VectorType(); - auto oldV2Type = shuffleOp.getV2VectorType(); + VectorType oldV1Type = shuffleOp.getV1VectorType(); + VectorType oldV2Type = shuffleOp.getV2VectorType(); - // When both operands are SPIR-V vectors, emit a SPIR-V shuffle. - if (oldV1Type.getNumElements() > 1 && oldV2Type.getNumElements() > 1) { + // When both operands and the result are SPIR-V vectors, emit a SPIR-V + // shuffle. + if (oldV1Type.getNumElements() > 1 && oldV2Type.getNumElements() > 1 && + oldResultType.getNumElements() > 1) { rewriter.replaceOpWithNewOp( shuffleOp, newResultType, adaptor.getV1(), adaptor.getV2(), rewriter.getI32ArrayAttr(mask)); return success(); } - // When at least one of the operands becomes a scalar after type conversion - // for SPIR-V, extract all the required elements and construct the result - // vector. + // When at least one of the operands or the result becomes a scalar after + // type conversion for SPIR-V, extract all the required elements and + // construct the result vector. auto getElementAtIdx = [&rewriter, loc = shuffleOp.getLoc()]( Value scalarOrVec, int32_t idx) -> Value { if (auto vecTy = dyn_cast(scalarOrVec.getType())) @@ -569,9 +571,14 @@ struct VectorShuffleOpConvert final newOperand = getElementAtIdx(vec, elementIdx); } + // Handle the scalar result corner case. + if (newOperands.size() == 1) { + rewriter.replaceOp(shuffleOp, newOperands.front()); + return success(); + } + rewriter.replaceOpWithNewOp( shuffleOp, newResultType, newOperands); - return success(); } }; diff --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp index 71e9648a5e00fa..6bb8dfecba0ec5 100644 --- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp @@ -1223,8 +1223,19 @@ static Operation *vectorizeAffineLoad(AffineLoadOp loadOp, LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: "); LLVM_DEBUG(permutationMap.print(dbgs())); + // Make sure that the in_bounds attribute corresponding to a broadcast dim + // is set to `true` - that's required by the xfer Op. + // FIXME: We're not veryfying whether the corresponding access is in bounds. + // TODO: Use masking instead. + SmallVector broadcastedDims = permutationMap.getBroadcastDims(); + SmallVector inBounds(vectorType.getRank(), false); + + for (auto idx : broadcastedDims) + inBounds[idx] = true; + auto transfer = state.builder.create( - loadOp.getLoc(), vectorType, loadOp.getMemRef(), indices, permutationMap); + loadOp.getLoc(), vectorType, loadOp.getMemRef(), indices, permutationMap, + inBounds); // Register replacement for future uses in the scope. state.registerOpVectorReplacement(loadOp, transfer); diff --git a/mlir/lib/Dialect/Bufferization/Transforms/LowerDeallocations.cpp b/mlir/lib/Dialect/Bufferization/Transforms/LowerDeallocations.cpp index 7fb46918ab1e8d..9e2c91bad7bfdd 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/LowerDeallocations.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/LowerDeallocations.cpp @@ -300,8 +300,9 @@ class DeallocOpConversion MemRefType::get({ShapedType::kDynamic}, rewriter.getI1Type()), retainCondsMemref); + Operation *symtableOp = op->getParentWithTrait(); rewriter.create( - op.getLoc(), deallocHelperFunc, + op.getLoc(), deallocHelperFuncMap.lookup(symtableOp), SmallVector{castedDeallocMemref, castedRetainMemref, castedCondsMemref, castedDeallocCondsMemref, castedRetainCondsMemref}); @@ -338,9 +339,11 @@ class DeallocOpConversion } public: - DeallocOpConversion(MLIRContext *context, func::FuncOp deallocHelperFunc) + DeallocOpConversion( + MLIRContext *context, + const bufferization::DeallocHelperMap &deallocHelperFuncMap) : OpConversionPattern(context), - deallocHelperFunc(deallocHelperFunc) {} + deallocHelperFuncMap(deallocHelperFuncMap) {} LogicalResult matchAndRewrite(bufferization::DeallocOp op, OpAdaptor adaptor, @@ -360,7 +363,8 @@ class DeallocOpConversion if (adaptor.getMemrefs().size() == 1) return rewriteOneMemrefMultipleRetainCase(op, adaptor, rewriter); - if (!deallocHelperFunc) + Operation *symtableOp = op->getParentWithTrait(); + if (!deallocHelperFuncMap.contains(symtableOp)) return op->emitError( "library function required for generic lowering, but cannot be " "automatically inserted when operating on functions"); @@ -369,7 +373,7 @@ class DeallocOpConversion } private: - func::FuncOp deallocHelperFunc; + const bufferization::DeallocHelperMap &deallocHelperFuncMap; }; } // namespace @@ -385,26 +389,29 @@ struct LowerDeallocationsPass return; } - func::FuncOp helperFuncOp; + bufferization::DeallocHelperMap deallocHelperFuncMap; if (auto module = dyn_cast(getOperation())) { OpBuilder builder = OpBuilder::atBlockBegin(&module.getBodyRegion().front()); - SymbolTable symbolTable(module); // Build dealloc helper function if there are deallocs. getOperation()->walk([&](bufferization::DeallocOp deallocOp) { - if (deallocOp.getMemrefs().size() > 1) { - helperFuncOp = bufferization::buildDeallocationLibraryFunction( - builder, getOperation()->getLoc(), symbolTable); - return WalkResult::interrupt(); + Operation *symtableOp = + deallocOp->getParentWithTrait(); + if (deallocOp.getMemrefs().size() > 1 && + !deallocHelperFuncMap.contains(symtableOp)) { + SymbolTable symbolTable(symtableOp); + func::FuncOp helperFuncOp = + bufferization::buildDeallocationLibraryFunction( + builder, getOperation()->getLoc(), symbolTable); + deallocHelperFuncMap[symtableOp] = helperFuncOp; } - return WalkResult::advance(); }); } RewritePatternSet patterns(&getContext()); - bufferization::populateBufferizationDeallocLoweringPattern(patterns, - helperFuncOp); + bufferization::populateBufferizationDeallocLoweringPattern( + patterns, deallocHelperFuncMap); ConversionTarget target(getContext()); target.addLegalDialect(patterns.getContext(), deallocLibraryFunc); + RewritePatternSet &patterns, + const bufferization::DeallocHelperMap &deallocHelperFuncMap) { + patterns.add(patterns.getContext(), + deallocHelperFuncMap); } std::unique_ptr mlir::bufferization::createLowerDeallocationsPass() { diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index a4c0508d0d8fae..68ee915cca3f42 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -1343,8 +1343,17 @@ vectorizeAsLinalgGeneric(RewriterBase &rewriter, VectorizationState &state, SmallVector indices(linalgOp.getShape(opOperand).size(), zero); + // Make sure that the in_bounds attribute corresponding to a broadcast dim + // is `true` + SmallVector broadcastedDims = readMap.getBroadcastDims(); + SmallVector inBounds(readType.getRank(), false); + + for (auto idx : broadcastedDims) + inBounds[idx] = true; + Operation *read = rewriter.create( - loc, readType, opOperand->get(), indices, readMap); + loc, readType, opOperand->get(), indices, readMap, + ArrayRef(inBounds)); read = state.maskOperation(rewriter, read, linalgOp, maskingMap); Value readValue = read->getResult(0); @@ -2681,11 +2690,12 @@ LogicalResult LinalgCopyVTRForwardingPattern::matchAndRewrite( // The `masked` attribute is only valid on this padded buffer. // When forwarding to vector.transfer_read, the attribute must be reset // conservatively. + auto vectorType = xferOp.getVectorType(); Value res = rewriter.create( - xferOp.getLoc(), xferOp.getVectorType(), in, xferOp.getIndices(), + xferOp.getLoc(), vectorType, in, xferOp.getIndices(), xferOp.getPermutationMapAttr(), xferOp.getPadding(), xferOp.getMask(), - // in_bounds is explicitly reset - /*inBoundsAttr=*/ArrayAttr()); + rewriter.getBoolArrayAttr( + SmallVector(vectorType.getRank(), false))); if (maybeFillOp) rewriter.eraseOp(maybeFillOp); @@ -2739,11 +2749,12 @@ LogicalResult LinalgCopyVTWForwardingPattern::matchAndRewrite( // The `masked` attribute is only valid on this padded buffer. // When forwarding to vector.transfer_write, the attribute must be reset // conservatively. + auto vector = xferOp.getVector(); rewriter.create( - xferOp.getLoc(), xferOp.getVector(), out, xferOp.getIndices(), + xferOp.getLoc(), vector, out, xferOp.getIndices(), xferOp.getPermutationMapAttr(), xferOp.getMask(), - // in_bounds is explicitly reset - /*inBoundsAttr=*/ArrayAttr()); + rewriter.getBoolArrayAttr( + SmallVector(vector.getType().getRank(), false))); rewriter.eraseOp(copyOp); rewriter.eraseOp(xferOp); diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index a7a0af231af33e..f5ec5a476ad8fa 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -33,6 +33,7 @@ #include #include #include +#include #include "mlir/Dialect/OpenMP/OpenMPOpsDialect.cpp.inc" #include "mlir/Dialect/OpenMP/OpenMPOpsEnums.cpp.inc" diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index 907d7f794593db..4de8dacc0edbf3 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -4297,33 +4297,42 @@ void IndexSwitchOp::getRegionInvocationBounds( bounds.emplace_back(/*lb=*/0, /*ub=*/i == liveIndex); } -LogicalResult IndexSwitchOp::fold(FoldAdaptor adaptor, - SmallVectorImpl &results) { - std::optional maybeCst = getConstantIntValue(getArg()); - if (!maybeCst.has_value()) - return failure(); - int64_t cst = *maybeCst; - int64_t caseIdx, e = getNumCases(); - for (caseIdx = 0; caseIdx < e; ++caseIdx) { - if (cst == getCases()[caseIdx]) - break; - } +struct FoldConstantCase : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - Region &r = (caseIdx < getNumCases()) ? getCaseRegions()[caseIdx] - : getDefaultRegion(); - Block &source = r.front(); - results.assign(source.getTerminator()->getOperands().begin(), - source.getTerminator()->getOperands().end()); + LogicalResult matchAndRewrite(scf::IndexSwitchOp op, + PatternRewriter &rewriter) const override { + // If `op.getArg()` is a constant, select the region that matches with + // the constant value. Use the default region if no matche is found. + std::optional maybeCst = getConstantIntValue(op.getArg()); + if (!maybeCst.has_value()) + return failure(); + int64_t cst = *maybeCst; + int64_t caseIdx, e = op.getNumCases(); + for (caseIdx = 0; caseIdx < e; ++caseIdx) { + if (cst == op.getCases()[caseIdx]) + break; + } - Block *pDestination = (*this)->getBlock(); - if (!pDestination) - return failure(); - Block::iterator insertionPoint = (*this)->getIterator(); - pDestination->getOperations().splice(insertionPoint, source.getOperations(), - source.getOperations().begin(), - std::prev(source.getOperations().end())); + Region &r = (caseIdx < op.getNumCases()) ? op.getCaseRegions()[caseIdx] + : op.getDefaultRegion(); + Block &source = r.front(); + Operation *terminator = source.getTerminator(); + SmallVector results = terminator->getOperands(); - return success(); + rewriter.inlineBlockBefore(&source, op); + rewriter.eraseOp(terminator); + // Repalce the operation with a potentially empty list of results. + // Fold mechanism doesn't support the case where the result list is empty. + rewriter.replaceOp(op, results); + + return success(); + } +}; + +void IndexSwitchOp::getCanonicalizationPatterns(RewritePatternSet &results, + MLIRContext *context) { + results.add(context); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/SCF/TransformOps/CMakeLists.txt b/mlir/lib/Dialect/SCF/TransformOps/CMakeLists.txt index 1d6f9ebd153f0b..06bccab80e7d80 100644 --- a/mlir/lib/Dialect/SCF/TransformOps/CMakeLists.txt +++ b/mlir/lib/Dialect/SCF/TransformOps/CMakeLists.txt @@ -13,6 +13,7 @@ add_mlir_dialect_library(MLIRSCFTransformOps MLIRIR MLIRLoopLikeInterface MLIRSCFDialect + MLIRSCFToControlFlow MLIRSCFTransforms MLIRSCFUtils MLIRTransformDialect diff --git a/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp b/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp index 56ff2709a589ec..c4a55c302d0a3e 100644 --- a/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp +++ b/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/SCF/TransformOps/SCFTransformOps.h" + +#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/LoopUtils.h" #include "mlir/Dialect/Arith/IR/Arith.h" @@ -49,6 +51,11 @@ void transform::ApplySCFStructuralConversionPatternsOp:: conversionTarget); } +void transform::ApplySCFToControlFlowPatternsOp::populatePatterns( + TypeConverter &typeConverter, RewritePatternSet &patterns) { + populateSCFToControlFlowConversionPatterns(patterns); +} + //===----------------------------------------------------------------------===// // ForallToForOp //===----------------------------------------------------------------------===// @@ -261,8 +268,10 @@ loopScheduling(scf::ForOp forOp, return 1; }; - std::optional ubConstant = getConstantIntValue(forOp.getUpperBound()); - std::optional lbConstant = getConstantIntValue(forOp.getLowerBound()); + std::optional ubConstant = + getConstantIntValue(forOp.getUpperBound()); + std::optional lbConstant = + getConstantIntValue(forOp.getLowerBound()); DenseMap opCycles; std::map> wrappedSchedule; for (Operation &op : forOp.getBody()->getOperations()) { diff --git a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp index e4f387d40ced2e..d2ab4cabb32bf1 100644 --- a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp +++ b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp @@ -42,7 +42,7 @@ mlir::getReassociationIndicesForCollapse(ArrayRef sourceShape, while (sourceDim < sourceShape.size()) { unsigned targetDim = reassociationMap.size(); // If we have mapped all the target dimensions stop and handle the remaining - // tail of size-1 dimensions explictly. + // tail of size-1 dimensions explicitly. if (targetDim == targetShape.size()) break; diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 55bace2e35f444..df3a59ed80ad48 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -3818,7 +3818,8 @@ void TransferReadOp::build(OpBuilder &builder, OperationState &result, auto permutationMapAttr = AffineMapAttr::get(permutationMap); auto inBoundsAttr = (inBounds && !inBounds.value().empty()) ? builder.getBoolArrayAttr(inBounds.value()) - : ArrayAttr(); + : builder.getBoolArrayAttr( + SmallVector(vectorType.getRank(), false)); build(builder, result, vectorType, source, indices, permutationMapAttr, inBoundsAttr); } @@ -3833,7 +3834,8 @@ void TransferReadOp::build(OpBuilder &builder, OperationState &result, auto permutationMapAttr = AffineMapAttr::get(permutationMap); auto inBoundsAttr = (inBounds && !inBounds.value().empty()) ? builder.getBoolArrayAttr(inBounds.value()) - : ArrayAttr(); + : builder.getBoolArrayAttr( + SmallVector(vectorType.getRank(), false)); build(builder, result, vectorType, source, indices, permutationMapAttr, padding, /*mask=*/Value(), inBoundsAttr); @@ -3951,17 +3953,15 @@ verifyTransferOp(VectorTransferOpInterface op, ShapedType shapedType, << inferredMaskType << ") and mask operand type (" << maskType << ") don't match"; - if (inBounds) { - if (permutationMap.getNumResults() != static_cast(inBounds.size())) - return op->emitOpError("expects the optional in_bounds attr of same rank " - "as permutation_map results: ") - << AffineMapAttr::get(permutationMap) - << " vs inBounds of size: " << inBounds.size(); - for (unsigned int i = 0; i < permutationMap.getNumResults(); ++i) - if (isa(permutationMap.getResult(i)) && - !llvm::cast(inBounds.getValue()[i]).getValue()) - return op->emitOpError("requires broadcast dimensions to be in-bounds"); - } + if (permutationMap.getNumResults() != static_cast(inBounds.size())) + return op->emitOpError("expects the in_bounds attr of same rank " + "as permutation_map results: ") + << AffineMapAttr::get(permutationMap) + << " vs inBounds of size: " << inBounds.size(); + for (unsigned int i = 0, e = permutationMap.getNumResults(); i < e; ++i) + if (isa(permutationMap.getResult(i)) && + !llvm::cast(inBounds.getValue()[i]).getValue()) + return op->emitOpError("requires broadcast dimensions to be in-bounds"); return success(); } @@ -4037,6 +4037,13 @@ ParseResult TransferReadOp::parse(OpAsmParser &parser, OperationState &result) { } else { permMap = llvm::cast(permMapAttr).getValue(); } + auto inBoundsAttrName = TransferReadOp::getInBoundsAttrName(result.name); + Attribute inBoundsAttr = result.attributes.get(inBoundsAttrName); + if (!inBoundsAttr) { + result.addAttribute(inBoundsAttrName, + builder.getBoolArrayAttr( + SmallVector(permMap.getNumResults(), false))); + } if (parser.resolveOperand(sourceInfo, shapedType, result.operands) || parser.resolveOperands(indexInfo, indexType, result.operands) || parser.resolveOperand(paddingInfo, shapedType.getElementType(), @@ -4081,8 +4088,7 @@ LogicalResult TransferReadOp::verify() { if (failed(verifyTransferOp(cast(getOperation()), shapedType, vectorType, maskType, - inferredMaskType, permutationMap, - getInBounds() ? *getInBounds() : ArrayAttr()))) + inferredMaskType, permutationMap, getInBounds()))) return failure(); if (auto sourceVectorElementType = @@ -4355,9 +4361,11 @@ void TransferWriteOp::build(OpBuilder &builder, OperationState &result, AffineMap permutationMap, std::optional> inBounds) { auto permutationMapAttr = AffineMapAttr::get(permutationMap); - auto inBoundsAttr = (inBounds && !inBounds.value().empty()) - ? builder.getBoolArrayAttr(inBounds.value()) - : ArrayAttr(); + auto inBoundsAttr = + (inBounds && !inBounds.value().empty()) + ? builder.getBoolArrayAttr(inBounds.value()) + : builder.getBoolArrayAttr(SmallVector( + llvm::cast(vector.getType()).getRank(), false)); build(builder, result, vector, dest, indices, permutationMapAttr, /*mask=*/Value(), inBoundsAttr); } @@ -4410,6 +4418,13 @@ ParseResult TransferWriteOp::parse(OpAsmParser &parser, } else { permMap = llvm::cast(permMapAttr).getValue(); } + auto inBoundsAttrName = TransferWriteOp::getInBoundsAttrName(result.name); + Attribute inBoundsAttr = result.attributes.get(inBoundsAttrName); + if (!inBoundsAttr) { + result.addAttribute(inBoundsAttrName, + builder.getBoolArrayAttr( + SmallVector(permMap.getNumResults(), false))); + } if (parser.resolveOperand(vectorInfo, vectorType, result.operands) || parser.resolveOperand(sourceInfo, shapedType, result.operands) || parser.resolveOperands(indexInfo, indexType, result.operands)) @@ -4463,8 +4478,7 @@ LogicalResult TransferWriteOp::verify() { if (failed(verifyTransferOp(cast(getOperation()), shapedType, vectorType, maskType, - inferredMaskType, permutationMap, - getInBounds() ? *getInBounds() : ArrayAttr()))) + inferredMaskType, permutationMap, getInBounds()))) return failure(); return verifyPermutationMap(permutationMap, diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp index f53bb5157eb37b..dfeb7bc53adad7 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp @@ -224,7 +224,7 @@ struct MaskedTransferReadOpPattern rewriter.replaceOpWithNewOp( maskingOp.getOperation(), readOp.getVectorType(), readOp.getSource(), readOp.getIndices(), readOp.getPermutationMap(), readOp.getPadding(), - maskingOp.getMask(), readOp.getInBounds().value_or(ArrayAttr())); + maskingOp.getMask(), readOp.getInBounds()); return success(); } }; @@ -246,7 +246,7 @@ struct MaskedTransferWriteOpPattern rewriter.replaceOpWithNewOp( maskingOp.getOperation(), resultType, writeOp.getVector(), writeOp.getSource(), writeOp.getIndices(), writeOp.getPermutationMap(), - maskingOp.getMask(), writeOp.getInBounds().value_or(ArrayAttr())); + maskingOp.getMask(), writeOp.getInBounds()); return success(); } }; diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp index c31c51489ecc96..b3c6dec47f6be4 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp @@ -133,9 +133,7 @@ struct TransferReadPermutationLowering // Transpose in_bounds attribute. ArrayAttr newInBoundsAttr = - op.getInBounds() ? inverseTransposeInBoundsAttr( - rewriter, op.getInBounds().value(), permutation) - : ArrayAttr(); + inverseTransposeInBoundsAttr(rewriter, op.getInBounds(), permutation); // Generate new transfer_read operation. VectorType newReadType = VectorType::get( @@ -208,9 +206,7 @@ struct TransferWritePermutationLowering // Transpose in_bounds attribute. ArrayAttr newInBoundsAttr = - op.getInBounds() ? inverseTransposeInBoundsAttr( - rewriter, op.getInBounds().value(), permutation) - : ArrayAttr(); + inverseTransposeInBoundsAttr(rewriter, op.getInBounds(), permutation); // Generate new transfer_write operation. Value newVec = rewriter.create( diff --git a/mlir/lib/IR/AffineExpr.cpp b/mlir/lib/IR/AffineExpr.cpp index bfb7c4849356eb..75cc01ee9a146a 100644 --- a/mlir/lib/IR/AffineExpr.cpp +++ b/mlir/lib/IR/AffineExpr.cpp @@ -751,8 +751,10 @@ static AffineExpr simplifyAdd(AffineExpr lhs, AffineExpr rhs) { } // Process lrhs, which is 'expr floordiv c'. + // expr + (expr // c * -c) = expr % c AffineBinaryOpExpr lrBinOpExpr = dyn_cast(lrhs); - if (!lrBinOpExpr || lrBinOpExpr.getKind() != AffineExprKind::FloorDiv) + if (!lrBinOpExpr || rhs.getKind() != AffineExprKind::Mul || + lrBinOpExpr.getKind() != AffineExprKind::FloorDiv) return nullptr; llrhs = lrBinOpExpr.getLHS(); diff --git a/mlir/lib/IR/AffineMap.cpp b/mlir/lib/IR/AffineMap.cpp index 62f595299afe29..859fb8ebc10e8c 100644 --- a/mlir/lib/IR/AffineMap.cpp +++ b/mlir/lib/IR/AffineMap.cpp @@ -158,6 +158,19 @@ bool AffineMap::isMinorIdentity() const { getMinorIdentityMap(getNumDims(), getNumResults(), getContext()); } +SmallVector AffineMap::getBroadcastDims() const { + SmallVector broadcastedDims; + for (const auto &[resIdx, expr] : llvm::enumerate(getResults())) { + if (auto constExpr = dyn_cast(expr)) { + if (constExpr.getValue() != 0) + continue; + broadcastedDims.push_back(resIdx); + } + } + + return broadcastedDims; +} + /// Returns true if this affine map is a minor identity up to broadcasted /// dimensions which are indicated by value 0 in the result. bool AffineMap::isMinorIdentityWithBroadcasting( diff --git a/mlir/lib/Target/LLVM/ROCDL/Target.cpp b/mlir/lib/Target/LLVM/ROCDL/Target.cpp index 047d214b751f19..70d6bcd76285aa 100644 --- a/mlir/lib/Target/LLVM/ROCDL/Target.cpp +++ b/mlir/lib/Target/LLVM/ROCDL/Target.cpp @@ -150,11 +150,6 @@ LogicalResult SerializeGPUModuleBase::appendStandardLibs(AMDGCNLibraries libs) { return failure(); } - // Get the ISA version. - StringRef isaVersion = - llvm::AMDGPU::getArchNameAMDGCN(llvm::AMDGPU::parseArchAMDGCN(chip)); - isaVersion.consume_front("gfx"); - // Helper function for adding a library. auto addLib = [&](const Twine &lib) -> bool { auto baseSize = path.size(); @@ -175,9 +170,7 @@ LogicalResult SerializeGPUModuleBase::appendStandardLibs(AMDGCNLibraries libs) { if ((any(libs & AMDGCNLibraries::Ocml) && addLib("ocml.bc")) || (any(libs & AMDGCNLibraries::Ockl) && addLib("ockl.bc")) || (any(libs & AMDGCNLibraries::Hip) && addLib("hip.bc")) || - (any(libs & AMDGCNLibraries::OpenCL) && addLib("opencl.bc")) || - (any(libs & (AMDGCNLibraries::Ocml | AMDGCNLibraries::Ockl)) && - addLib("oclc_isa_version_" + isaVersion + ".bc"))) + (any(libs & AMDGCNLibraries::OpenCL) && addLib("opencl.bc"))) return failure(); return success(); } @@ -270,6 +263,14 @@ void SerializeGPUModuleBase::addControlVariables( // Add ocml or ockl related control variables. if (any(libs & (AMDGCNLibraries::Ocml | AMDGCNLibraries::Ockl))) { addControlVariable("__oclc_wavefrontsize64", wave64, 8); + + // Get the ISA version. + llvm::AMDGPU::IsaVersion isaVersion = llvm::AMDGPU::getIsaVersion(chip); + // Add the ISA control variable. + addControlVariable("__oclc_ISA_version", + isaVersion.Minor + 100 * isaVersion.Stepping + + 1000 * isaVersion.Major, + 32); int abi = 500; abiVer.getAsInteger(0, abi); addControlVariable("__oclc_ABI_version", abi, 32); diff --git a/mlir/lib/Target/LLVMIR/DebugImporter.h b/mlir/lib/Target/LLVMIR/DebugImporter.h index 4a2bf35c160e14..0e040891ba6c02 100644 --- a/mlir/lib/Target/LLVMIR/DebugImporter.h +++ b/mlir/lib/Target/LLVMIR/DebugImporter.h @@ -18,6 +18,7 @@ #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/MLIRContext.h" #include "mlir/Support/CyclicReplacerCache.h" +#include "llvm/ADT/MapVector.h" #include "llvm/IR/DebugInfoMetadata.h" namespace mlir { diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index 9915576bbc458b..5bc3dd680d02dd 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -1688,6 +1688,7 @@ static constexpr std::array kExplicitAttributes{ StringLiteral("noinline"), StringLiteral("optnone"), StringLiteral("target-features"), + StringLiteral("tune-cpu"), StringLiteral("unsafe-fp-math"), StringLiteral("vscale_range"), }; @@ -1804,6 +1805,10 @@ void ModuleImport::processFunctionAttributes(llvm::Function *func, attr.isStringAttribute()) funcOp.setTargetCpuAttr(StringAttr::get(context, attr.getValueAsString())); + if (llvm::Attribute attr = func->getFnAttribute("tune-cpu"); + attr.isStringAttribute()) + funcOp.setTuneCpuAttr(StringAttr::get(context, attr.getValueAsString())); + if (llvm::Attribute attr = func->getFnAttribute("target-features"); attr.isStringAttribute()) funcOp.setTargetFeaturesAttr( diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index c96ed939e4bf26..ef226dd3a77d5b 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -1331,6 +1331,9 @@ LogicalResult ModuleTranslation::convertOneFunction(LLVMFuncOp func) { if (auto targetCpu = func.getTargetCpu()) llvmFunc->addFnAttr("target-cpu", *targetCpu); + if (auto tuneCpu = func.getTuneCpu()) + llvmFunc->addFnAttr("tune-cpu", *tuneCpu); + if (auto targetFeatures = func.getTargetFeatures()) llvmFunc->addFnAttr("target-features", targetFeatures->getFeaturesString()); diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index e6c0ee2ab29490..1e0afee2373a91 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -707,10 +707,9 @@ class UnresolvedMaterializationRewrite : public OperationRewrite { UnresolvedMaterializationRewrite( ConversionPatternRewriterImpl &rewriterImpl, UnrealizedConversionCastOp op, const TypeConverter *converter = nullptr, - MaterializationKind kind = MaterializationKind::Target, - Type origOutputType = nullptr) + MaterializationKind kind = MaterializationKind::Target) : OperationRewrite(Kind::UnresolvedMaterialization, rewriterImpl, op), - converterAndKind(converter, kind), origOutputType(origOutputType) {} + converterAndKind(converter, kind) {} static bool classof(const IRRewrite *rewrite) { return rewrite->getKind() == Kind::UnresolvedMaterialization; @@ -734,17 +733,11 @@ class UnresolvedMaterializationRewrite : public OperationRewrite { return converterAndKind.getInt(); } - /// Return the original illegal output type of the input values. - Type getOrigOutputType() const { return origOutputType; } - private: /// The corresponding type converter to use when resolving this /// materialization, and the kind of this materialization. llvm::PointerIntPair converterAndKind; - - /// The original output type. This is only used for argument conversions. - Type origOutputType; }; } // namespace @@ -860,12 +853,10 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { Block *insertBlock, Block::iterator insertPt, Location loc, ValueRange inputs, Type outputType, - Type origOutputType, const TypeConverter *converter); Value buildUnresolvedArgumentMaterialization(Block *block, Location loc, ValueRange inputs, - Type origOutputType, Type outputType, const TypeConverter *converter); @@ -1388,20 +1379,28 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( if (replArgs.size() == 1 && (!converter || replArgs[0].getType() == origArg.getType())) { newArg = replArgs.front(); + mapping.map(origArg, newArg); } else { - Type origOutputType = origArg.getType(); - - // Legalize the argument output type. - Type outputType = origOutputType; - if (Type legalOutputType = converter->convertType(outputType)) - outputType = legalOutputType; - - newArg = buildUnresolvedArgumentMaterialization( - newBlock, origArg.getLoc(), replArgs, origOutputType, outputType, - converter); + // Build argument materialization: new block arguments -> old block + // argument type. + Value argMat = buildUnresolvedArgumentMaterialization( + newBlock, origArg.getLoc(), replArgs, origArg.getType(), converter); + mapping.map(origArg, argMat); + + // Build target materialization: old block argument type -> legal type. + // Note: This function returns an "empty" type if no valid conversion to + // a legal type exists. In that case, we continue the conversion with the + // original block argument type. + Type legalOutputType = converter->convertType(origArg.getType()); + if (legalOutputType && legalOutputType != origArg.getType()) { + newArg = buildUnresolvedTargetMaterialization( + origArg.getLoc(), argMat, legalOutputType, converter); + mapping.map(argMat, newArg); + } else { + newArg = argMat; + } } - mapping.map(origArg, newArg); appendRewrite(block, origArg); argInfo[i] = ConvertedArgInfo(inputMap->inputNo, inputMap->size, newArg); } @@ -1424,7 +1423,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( /// of input operands. Value ConversionPatternRewriterImpl::buildUnresolvedMaterialization( MaterializationKind kind, Block *insertBlock, Block::iterator insertPt, - Location loc, ValueRange inputs, Type outputType, Type origOutputType, + Location loc, ValueRange inputs, Type outputType, const TypeConverter *converter) { // Avoid materializing an unnecessary cast. if (inputs.size() == 1 && inputs.front().getType() == outputType) @@ -1435,16 +1434,15 @@ Value ConversionPatternRewriterImpl::buildUnresolvedMaterialization( OpBuilder builder(insertBlock, insertPt); auto convertOp = builder.create(loc, outputType, inputs); - appendRewrite(convertOp, converter, kind, - origOutputType); + appendRewrite(convertOp, converter, kind); return convertOp.getResult(0); } Value ConversionPatternRewriterImpl::buildUnresolvedArgumentMaterialization( - Block *block, Location loc, ValueRange inputs, Type origOutputType, - Type outputType, const TypeConverter *converter) { + Block *block, Location loc, ValueRange inputs, Type outputType, + const TypeConverter *converter) { return buildUnresolvedMaterialization(MaterializationKind::Argument, block, block->begin(), loc, inputs, outputType, - origOutputType, converter); + converter); } Value ConversionPatternRewriterImpl::buildUnresolvedTargetMaterialization( Location loc, Value input, Type outputType, @@ -1456,7 +1454,7 @@ Value ConversionPatternRewriterImpl::buildUnresolvedTargetMaterialization( return buildUnresolvedMaterialization(MaterializationKind::Target, insertBlock, insertPt, loc, input, - outputType, outputType, converter); + outputType, converter); } //===----------------------------------------------------------------------===// @@ -2672,6 +2670,9 @@ static void computeNecessaryMaterializations( ConversionPatternRewriterImpl &rewriterImpl, DenseMap> &inverseMapping, SetVector &necessaryMaterializations) { + // Helper function to check if the given value or a not yet materialized + // replacement of the given value is live. + // Note: `inverseMapping` maps from replaced values to original values. auto isLive = [&](Value value) { auto findFn = [&](Operation *user) { auto matIt = materializationOps.find(user); @@ -2679,12 +2680,18 @@ static void computeNecessaryMaterializations( return !necessaryMaterializations.count(matIt->second); return rewriterImpl.isOpIgnored(user); }; - // This value may be replacing another value that has a live user. - for (Value inv : inverseMapping.lookup(value)) - if (llvm::find_if_not(inv.getUsers(), findFn) != inv.user_end()) + // A worklist is needed because a value may have gone through a chain of + // replacements and each of the replaced values may have live users. + SmallVector worklist; + worklist.push_back(value); + while (!worklist.empty()) { + Value next = worklist.pop_back_val(); + if (llvm::find_if_not(next.getUsers(), findFn) != next.user_end()) return true; - // Or have live users itself. - return llvm::find_if_not(value.getUsers(), findFn) != value.user_end(); + // This value may be replacing another value that has a live user. + llvm::append_range(worklist, inverseMapping.lookup(next)); + } + return false; }; llvm::unique_function lookupRemappedValue = @@ -2844,18 +2851,10 @@ static LogicalResult legalizeUnresolvedMaterialization( switch (mat.getMaterializationKind()) { case MaterializationKind::Argument: // Try to materialize an argument conversion. - // FIXME: The current argument materialization hook expects the original - // output type, even though it doesn't use that as the actual output type - // of the generated IR. The output type is just used as an indicator of - // the type of materialization to do. This behavior is really awkward in - // that it diverges from the behavior of the other hooks, and can be - // easily misunderstood. We should clean up the argument hooks to better - // represent the desired invariants we actually care about. newMaterialization = converter->materializeArgumentConversion( - rewriter, op->getLoc(), mat.getOrigOutputType(), inputOperands); + rewriter, op->getLoc(), outputType, inputOperands); if (newMaterialization) break; - // If an argument materialization failed, fallback to trying a target // materialization. [[fallthrough]]; @@ -2865,6 +2864,8 @@ static LogicalResult legalizeUnresolvedMaterialization( break; } if (newMaterialization) { + assert(newMaterialization.getType() == outputType && + "materialization callback produced value of incorrect type"); replaceMaterialization(rewriterImpl, opResult, newMaterialization, inverseMapping); return success(); diff --git a/mlir/python/mlir/dialects/linalg/opdsl/lang/comprehension.py b/mlir/python/mlir/dialects/linalg/opdsl/lang/comprehension.py index 1a198fc5ec6f90..4f81a3874650d7 100644 --- a/mlir/python/mlir/dialects/linalg/opdsl/lang/comprehension.py +++ b/mlir/python/mlir/dialects/linalg/opdsl/lang/comprehension.py @@ -292,6 +292,7 @@ class UnaryFn: ceil = UnaryFnType("ceil") floor = UnaryFnType("floor") negf = UnaryFnType("negf") + reciprocal = UnaryFnType("reciprocal") round = UnaryFnType("round") sqrt = UnaryFnType("sqrt") rsqrt = UnaryFnType("rsqrt") diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py index 3ceee8e3704451..67bde8f736ef46 100644 --- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py +++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py @@ -108,6 +108,18 @@ def negf( O[None] = UnaryFn.negf(I[None]) +@linalg_structured_op(op_class_name="ReciprocalOp") +def reciprocal( + I=TensorDef(T1), + O=TensorDef(T1, output=True), +): + """Applies reciprocal(x) elementwise. + + No numeric casting is performed on the input operand. + """ + O[None] = UnaryFn.reciprocal(I[None]) + + @linalg_structured_op def round( I=TensorDef(T1), diff --git a/mlir/test/CAPI/CMakeLists.txt b/mlir/test/CAPI/CMakeLists.txt index ad312764b3e06c..e795672bce5d12 100644 --- a/mlir/test/CAPI/CMakeLists.txt +++ b/mlir/test/CAPI/CMakeLists.txt @@ -89,6 +89,15 @@ _add_capi_test_executable(mlir-capi-quant-test MLIRCAPIQuant ) +_add_capi_test_executable(mlir-capi-rewrite-test + rewrite.c + LINK_LIBS PRIVATE + MLIRCAPIIR + MLIRCAPIRegisterEverything + MLIRCAPITransforms +) + + _add_capi_test_executable(mlir-capi-transform-test transform.c LINK_LIBS PRIVATE diff --git a/mlir/test/CAPI/rewrite.c b/mlir/test/CAPI/rewrite.c new file mode 100644 index 00000000000000..a8b593eabb781d --- /dev/null +++ b/mlir/test/CAPI/rewrite.c @@ -0,0 +1,551 @@ +//===- rewrite.c - Test of the rewriting C API ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM +// Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// RUN: mlir-capi-rewrite-test 2>&1 | FileCheck %s + +#include "mlir-c/Rewrite.h" +#include "mlir-c/BuiltinTypes.h" +#include "mlir-c/IR.h" + +#include +#include + +MlirOperation createOperationWithName(MlirContext ctx, const char *name) { + MlirStringRef nameRef = mlirStringRefCreateFromCString(name); + MlirLocation loc = mlirLocationUnknownGet(ctx); + MlirOperationState state = mlirOperationStateGet(nameRef, loc); + MlirType indexType = mlirIndexTypeGet(ctx); + mlirOperationStateAddResults(&state, 1, &indexType); + return mlirOperationCreate(&state); +} + +void testInsertionPoint(MlirContext ctx) { + // CHECK-LABEL: @testInsertionPoint + fprintf(stderr, "@testInsertionPoint\n"); + + const char *moduleString = "\"dialect.op1\"() : () -> ()\n"; + MlirModule module = + mlirModuleCreateParse(ctx, mlirStringRefCreateFromCString(moduleString)); + MlirOperation op = mlirModuleGetOperation(module); + MlirBlock body = mlirModuleGetBody(module); + MlirOperation op1 = mlirBlockGetFirstOperation(body); + + // IRRewriter create + MlirRewriterBase rewriter = mlirIRRewriterCreate(ctx); + + // Insert before op + mlirRewriterBaseSetInsertionPointBefore(rewriter, op1); + MlirOperation op2 = createOperationWithName(ctx, "dialect.op2"); + mlirRewriterBaseInsert(rewriter, op2); + + // Insert after op + mlirRewriterBaseSetInsertionPointAfter(rewriter, op2); + MlirOperation op3 = createOperationWithName(ctx, "dialect.op3"); + mlirRewriterBaseInsert(rewriter, op3); + MlirValue op3Res = mlirOperationGetResult(op3, 0); + + // Insert after value + mlirRewriterBaseSetInsertionPointAfterValue(rewriter, op3Res); + MlirOperation op4 = createOperationWithName(ctx, "dialect.op4"); + mlirRewriterBaseInsert(rewriter, op4); + + // Insert at beginning of block + mlirRewriterBaseSetInsertionPointToStart(rewriter, body); + MlirOperation op5 = createOperationWithName(ctx, "dialect.op5"); + mlirRewriterBaseInsert(rewriter, op5); + + // Insert at end of block + mlirRewriterBaseSetInsertionPointToEnd(rewriter, body); + MlirOperation op6 = createOperationWithName(ctx, "dialect.op6"); + mlirRewriterBaseInsert(rewriter, op6); + + // Get insertion blocks + MlirBlock block1 = mlirRewriterBaseGetBlock(rewriter); + MlirBlock block2 = mlirRewriterBaseGetInsertionBlock(rewriter); + assert(body.ptr == block1.ptr); + assert(body.ptr == block2.ptr); + + // clang-format off + // CHECK-NEXT: module { + // CHECK-NEXT: %{{.*}} = "dialect.op5"() : () -> index + // CHECK-NEXT: %{{.*}} = "dialect.op2"() : () -> index + // CHECK-NEXT: %{{.*}} = "dialect.op3"() : () -> index + // CHECK-NEXT: %{{.*}} = "dialect.op4"() : () -> index + // CHECK-NEXT: "dialect.op1"() : () -> () + // CHECK-NEXT: %{{.*}} = "dialect.op6"() : () -> index + // CHECK-NEXT: } + // clang-format on + mlirOperationDump(op); + + mlirIRRewriterDestroy(rewriter); + mlirModuleDestroy(module); +} + +void testCreateBlock(MlirContext ctx) { + // CHECK-LABEL: @testCreateBlock + fprintf(stderr, "@testCreateBlock\n"); + + const char *moduleString = "\"dialect.op1\"() ({^bb0:}) : () -> ()\n" + "\"dialect.op2\"() ({^bb0:}) : () -> ()\n"; + MlirModule module = + mlirModuleCreateParse(ctx, mlirStringRefCreateFromCString(moduleString)); + MlirOperation op = mlirModuleGetOperation(module); + MlirBlock body = mlirModuleGetBody(module); + + MlirOperation op1 = mlirBlockGetFirstOperation(body); + MlirRegion region1 = mlirOperationGetRegion(op1, 0); + MlirBlock block1 = mlirRegionGetFirstBlock(region1); + + MlirOperation op2 = mlirOperationGetNextInBlock(op1); + MlirRegion region2 = mlirOperationGetRegion(op2, 0); + MlirBlock block2 = mlirRegionGetFirstBlock(region2); + + MlirRewriterBase rewriter = mlirIRRewriterCreate(ctx); + + // Create block before + MlirType indexType = mlirIndexTypeGet(ctx); + MlirLocation unknown = mlirLocationUnknownGet(ctx); + mlirRewriterBaseCreateBlockBefore(rewriter, block1, 1, &indexType, &unknown); + + mlirRewriterBaseSetInsertionPointToEnd(rewriter, body); + + // Clone operation + mlirRewriterBaseClone(rewriter, op1); + + // Clone without regions + mlirRewriterBaseCloneWithoutRegions(rewriter, op1); + + // Clone region before + mlirRewriterBaseCloneRegionBefore(rewriter, region1, block2); + + mlirOperationDump(op); + // clang-format off + // CHECK-NEXT: "builtin.module"() ({ + // CHECK-NEXT: "dialect.op1"() ({ + // CHECK-NEXT: ^{{.*}}(%{{.*}}: index): + // CHECK-NEXT: ^{{.*}}: + // CHECK-NEXT: }) : () -> () + // CHECK-NEXT: "dialect.op2"() ({ + // CHECK-NEXT: ^{{.*}}(%{{.*}}: index): + // CHECK-NEXT: ^{{.*}}: + // CHECK-NEXT: ^{{.*}}: + // CHECK-NEXT: }) : () -> () + // CHECK-NEXT: "dialect.op1"() ({ + // CHECK-NEXT: ^{{.*}}(%{{.*}}: index): + // CHECK-NEXT: ^{{.*}}: + // CHECK-NEXT: }) : () -> () + // CHECK-NEXT: "dialect.op1"() ({ + // CHECK-NEXT: }) : () -> () + // CHECK-NEXT: }) : () -> () + // clang-format on + + mlirIRRewriterDestroy(rewriter); + mlirModuleDestroy(module); +} + +void testInlineRegionBlock(MlirContext ctx) { + // CHECK-LABEL: @testInlineRegionBlock + fprintf(stderr, "@testInlineRegionBlock\n"); + + const char *moduleString = + "\"dialect.op1\"() ({\n" + " ^bb0(%arg0: index):\n" + " \"dialect.op1_in1\"(%arg0) [^bb1] : (index) -> ()\n" + " ^bb1():\n" + " \"dialect.op1_in2\"() : () -> ()\n" + "}) : () -> ()\n" + "\"dialect.op2\"() ({^bb0:}) : () -> ()\n" + "\"dialect.op3\"() ({\n" + " ^bb0(%arg0: index):\n" + " \"dialect.op3_in1\"(%arg0) : (index) -> ()\n" + " ^bb1():\n" + " %x = \"dialect.op3_in2\"() : () -> index\n" + " %y = \"dialect.op3_in3\"() : () -> index\n" + "}) : () -> ()\n" + "\"dialect.op4\"() ({\n" + " ^bb0():\n" + " \"dialect.op4_in1\"() : () -> index\n" + " ^bb1(%arg0: index):\n" + " \"dialect.op4_in2\"(%arg0) : (index) -> ()\n" + "}) : () -> ()\n"; + MlirModule module = + mlirModuleCreateParse(ctx, mlirStringRefCreateFromCString(moduleString)); + MlirOperation op = mlirModuleGetOperation(module); + MlirBlock body = mlirModuleGetBody(module); + + MlirOperation op1 = mlirBlockGetFirstOperation(body); + MlirRegion region1 = mlirOperationGetRegion(op1, 0); + + MlirOperation op2 = mlirOperationGetNextInBlock(op1); + MlirRegion region2 = mlirOperationGetRegion(op2, 0); + MlirBlock block2 = mlirRegionGetFirstBlock(region2); + + MlirOperation op3 = mlirOperationGetNextInBlock(op2); + MlirRegion region3 = mlirOperationGetRegion(op3, 0); + MlirBlock block3_1 = mlirRegionGetFirstBlock(region3); + MlirBlock block3_2 = mlirBlockGetNextInRegion(block3_1); + MlirOperation op3_in2 = mlirBlockGetFirstOperation(block3_2); + MlirValue op3_in2_res = mlirOperationGetResult(op3_in2, 0); + MlirOperation op3_in3 = mlirOperationGetNextInBlock(op3_in2); + + MlirOperation op4 = mlirOperationGetNextInBlock(op3); + MlirRegion region4 = mlirOperationGetRegion(op4, 0); + MlirBlock block4_1 = mlirRegionGetFirstBlock(region4); + MlirOperation op4_in1 = mlirBlockGetFirstOperation(block4_1); + MlirValue op4_in1_res = mlirOperationGetResult(op4_in1, 0); + MlirBlock block4_2 = mlirBlockGetNextInRegion(block4_1); + + MlirRewriterBase rewriter = mlirIRRewriterCreate(ctx); + + // Test these three functions + mlirRewriterBaseInlineRegionBefore(rewriter, region1, block2); + mlirRewriterBaseInlineBlockBefore(rewriter, block3_1, op3_in3, 1, + &op3_in2_res); + mlirRewriterBaseMergeBlocks(rewriter, block4_2, block4_1, 1, &op4_in1_res); + + mlirOperationDump(op); + // clang-format off + // CHECK-NEXT: "builtin.module"() ({ + // CHECK-NEXT: "dialect.op1"() ({ + // CHECK-NEXT: }) : () -> () + // CHECK-NEXT: "dialect.op2"() ({ + // CHECK-NEXT: ^{{.*}}(%{{.*}}: index): + // CHECK-NEXT: "dialect.op1_in1"(%{{.*}})[^[[bb:.*]]] : (index) -> () + // CHECK-NEXT: ^[[bb]]: + // CHECK-NEXT: "dialect.op1_in2"() : () -> () + // CHECK-NEXT: ^{{.*}}: // no predecessors + // CHECK-NEXT: }) : () -> () + // CHECK-NEXT: "dialect.op3"() ({ + // CHECK-NEXT: %{{.*}} = "dialect.op3_in2"() : () -> index + // CHECK-NEXT: "dialect.op3_in1"(%{{.*}}) : (index) -> () + // CHECK-NEXT: %{{.*}} = "dialect.op3_in3"() : () -> index + // CHECK-NEXT: }) : () -> () + // CHECK-NEXT: "dialect.op4"() ({ + // CHECK-NEXT: %{{.*}} = "dialect.op4_in1"() : () -> index + // CHECK-NEXT: "dialect.op4_in2"(%{{.*}}) : (index) -> () + // CHECK-NEXT: }) : () -> () + // CHECK-NEXT: }) : () -> () + // clang-format on + + mlirIRRewriterDestroy(rewriter); + mlirModuleDestroy(module); +} + +void testReplaceOp(MlirContext ctx) { + // CHECK-LABEL: @testReplaceOp + fprintf(stderr, "@testReplaceOp\n"); + + const char *moduleString = + "%x, %y, %z = \"dialect.create_values\"() : () -> (index, index, index)\n" + "%x_1, %y_1 = \"dialect.op1\"() : () -> (index, index)\n" + "\"dialect.use_op1\"(%x_1, %y_1) : (index, index) -> ()\n" + "%x_2, %y_2 = \"dialect.op2\"() : () -> (index, index)\n" + "%x_3, %y_3 = \"dialect.op3\"() : () -> (index, index)\n" + "\"dialect.use_op2\"(%x_2, %y_2) : (index, index) -> ()\n"; + MlirModule module = + mlirModuleCreateParse(ctx, mlirStringRefCreateFromCString(moduleString)); + MlirOperation op = mlirModuleGetOperation(module); + MlirBlock body = mlirModuleGetBody(module); + + // get a handle to all operations/values + MlirOperation createValues = mlirBlockGetFirstOperation(body); + MlirValue x = mlirOperationGetResult(createValues, 0); + MlirValue z = mlirOperationGetResult(createValues, 2); + MlirOperation op1 = mlirOperationGetNextInBlock(createValues); + MlirOperation useOp1 = mlirOperationGetNextInBlock(op1); + MlirOperation op2 = mlirOperationGetNextInBlock(useOp1); + MlirOperation op3 = mlirOperationGetNextInBlock(op2); + + MlirRewriterBase rewriter = mlirIRRewriterCreate(ctx); + + // Test replace op with values + MlirValue xz[2] = {x, z}; + mlirRewriterBaseReplaceOpWithValues(rewriter, op1, 2, xz); + + // Test replace op with op + mlirRewriterBaseReplaceOpWithOperation(rewriter, op2, op3); + + mlirOperationDump(op); + // clang-format off + // CHECK-NEXT: module { + // CHECK-NEXT: %[[res:.*]]:3 = "dialect.create_values"() : () -> (index, index, index) + // CHECK-NEXT: "dialect.use_op1"(%[[res]]#0, %[[res]]#2) : (index, index) -> () + // CHECK-NEXT: %[[res2:.*]]:2 = "dialect.op3"() : () -> (index, index) + // CHECK-NEXT: "dialect.use_op2"(%[[res2]]#0, %[[res2]]#1) : (index, index) -> () + // CHECK-NEXT: } + // clang-format on + + mlirIRRewriterDestroy(rewriter); + mlirModuleDestroy(module); +} + +void testErase(MlirContext ctx) { + // CHECK-LABEL: @testErase + fprintf(stderr, "@testErase\n"); + + const char *moduleString = "\"dialect.op_to_erase\"() : () -> ()\n" + "\"dialect.op2\"() ({\n" + "^bb0():\n" + " \"dialect.op2_nested\"() : () -> ()" + "^block_to_erase():\n" + " \"dialect.op2_nested\"() : () -> ()" + "^bb1():\n" + " \"dialect.op2_nested\"() : () -> ()" + "}) : () -> ()\n"; + MlirModule module = + mlirModuleCreateParse(ctx, mlirStringRefCreateFromCString(moduleString)); + MlirOperation op = mlirModuleGetOperation(module); + MlirBlock body = mlirModuleGetBody(module); + + // get a handle to all operations/values + MlirOperation opToErase = mlirBlockGetFirstOperation(body); + MlirOperation op2 = mlirOperationGetNextInBlock(opToErase); + MlirRegion op2Region = mlirOperationGetRegion(op2, 0); + MlirBlock bb0 = mlirRegionGetFirstBlock(op2Region); + MlirBlock blockToErase = mlirBlockGetNextInRegion(bb0); + + MlirRewriterBase rewriter = mlirIRRewriterCreate(ctx); + mlirRewriterBaseEraseOp(rewriter, opToErase); + mlirRewriterBaseEraseBlock(rewriter, blockToErase); + + mlirOperationDump(op); + // CHECK-NEXT: module { + // CHECK-NEXT: "dialect.op2"() ({ + // CHECK-NEXT: "dialect.op2_nested"() : () -> () + // CHECK-NEXT: ^{{.*}}: + // CHECK-NEXT: "dialect.op2_nested"() : () -> () + // CHECK-NEXT: }) : () -> () + // CHECK-NEXT: } + + mlirIRRewriterDestroy(rewriter); + mlirModuleDestroy(module); +} + +void testMove(MlirContext ctx) { + // CHECK-LABEL: @testMove + fprintf(stderr, "@testMove\n"); + + const char *moduleString = "\"dialect.op1\"() : () -> ()\n" + "\"dialect.op2\"() ({\n" + "^bb0(%arg0: index):\n" + " \"dialect.op2_1\"(%arg0) : (index) -> ()" + "^bb1(%arg1: index):\n" + " \"dialect.op2_2\"(%arg1) : (index) -> ()" + "}) : () -> ()\n" + "\"dialect.op3\"() : () -> ()\n" + "\"dialect.op4\"() : () -> ()\n"; + + MlirModule module = + mlirModuleCreateParse(ctx, mlirStringRefCreateFromCString(moduleString)); + MlirOperation op = mlirModuleGetOperation(module); + MlirBlock body = mlirModuleGetBody(module); + + // get a handle to all operations/values + MlirOperation op1 = mlirBlockGetFirstOperation(body); + MlirOperation op2 = mlirOperationGetNextInBlock(op1); + MlirOperation op3 = mlirOperationGetNextInBlock(op2); + MlirOperation op4 = mlirOperationGetNextInBlock(op3); + + MlirRegion region2 = mlirOperationGetRegion(op2, 0); + MlirBlock block0 = mlirRegionGetFirstBlock(region2); + MlirBlock block1 = mlirBlockGetNextInRegion(block0); + + // Test move operations. + MlirRewriterBase rewriter = mlirIRRewriterCreate(ctx); + mlirRewriterBaseMoveOpBefore(rewriter, op3, op1); + mlirRewriterBaseMoveOpAfter(rewriter, op4, op1); + mlirRewriterBaseMoveBlockBefore(rewriter, block1, block0); + + mlirOperationDump(op); + // CHECK-NEXT: module { + // CHECK-NEXT: "dialect.op3"() : () -> () + // CHECK-NEXT: "dialect.op1"() : () -> () + // CHECK-NEXT: "dialect.op4"() : () -> () + // CHECK-NEXT: "dialect.op2"() ({ + // CHECK-NEXT: ^{{.*}}(%[[arg0:.*]]: index): + // CHECK-NEXT: "dialect.op2_2"(%[[arg0]]) : (index) -> () + // CHECK-NEXT: ^{{.*}}(%[[arg1:.*]]: index): // no predecessors + // CHECK-NEXT: "dialect.op2_1"(%[[arg1]]) : (index) -> () + // CHECK-NEXT: }) : () -> () + // CHECK-NEXT: } + + mlirIRRewriterDestroy(rewriter); + mlirModuleDestroy(module); +} + +void testOpModification(MlirContext ctx) { + // CHECK-LABEL: @testOpModification + fprintf(stderr, "@testOpModification\n"); + + const char *moduleString = + "%x, %y = \"dialect.op1\"() : () -> (index, index)\n" + "\"dialect.op2\"(%x) : (index) -> ()\n"; + + MlirModule module = + mlirModuleCreateParse(ctx, mlirStringRefCreateFromCString(moduleString)); + MlirOperation op = mlirModuleGetOperation(module); + MlirBlock body = mlirModuleGetBody(module); + + // get a handle to all operations/values + MlirOperation op1 = mlirBlockGetFirstOperation(body); + MlirValue y = mlirOperationGetResult(op1, 1); + MlirOperation op2 = mlirOperationGetNextInBlock(op1); + + MlirRewriterBase rewriter = mlirIRRewriterCreate(ctx); + mlirRewriterBaseStartOpModification(rewriter, op1); + mlirRewriterBaseCancelOpModification(rewriter, op1); + + mlirRewriterBaseStartOpModification(rewriter, op2); + mlirOperationSetOperand(op2, 0, y); + mlirRewriterBaseFinalizeOpModification(rewriter, op2); + + mlirOperationDump(op); + // CHECK-NEXT: module { + // CHECK-NEXT: %[[xy:.*]]:2 = "dialect.op1"() : () -> (index, index) + // CHECK-NEXT: "dialect.op2"(%[[xy]]#1) : (index) -> () + // CHECK-NEXT: } + + mlirIRRewriterDestroy(rewriter); + mlirModuleDestroy(module); +} + +void testReplaceUses(MlirContext ctx) { + // CHECK-LABEL: @testReplaceUses + fprintf(stderr, "@testReplaceUses\n"); + + const char *moduleString = + // Replace values with values + "%x1, %y1, %z1 = \"dialect.op1\"() : () -> (index, index, index)\n" + "%x2, %y2, %z2 = \"dialect.op2\"() : () -> (index, index, index)\n" + "\"dialect.op1_uses\"(%x1, %y1, %z1) : (index, index, index) -> ()\n" + // Replace op with values + "%x3 = \"dialect.op3\"() : () -> index\n" + "%x4 = \"dialect.op4\"() : () -> index\n" + "\"dialect.op3_uses\"(%x3) : (index) -> ()\n" + // Replace op with op + "%x5 = \"dialect.op5\"() : () -> index\n" + "%x6 = \"dialect.op6\"() : () -> index\n" + "\"dialect.op5_uses\"(%x5) : (index) -> ()\n" + // Replace op in block; + "%x7 = \"dialect.op7\"() : () -> index\n" + "%x8 = \"dialect.op8\"() : () -> index\n" + "\"dialect.op9\"() ({\n" + "^bb0:\n" + " \"dialect.op7_uses\"(%x7) : (index) -> ()\n" + "}): () -> ()\n" + "\"dialect.op7_uses\"(%x7) : (index) -> ()\n" + // Replace value with value except in op + "%x10 = \"dialect.op10\"() : () -> index\n" + "%x11 = \"dialect.op11\"() : () -> index\n" + "\"dialect.op10_uses\"(%x10) : (index) -> ()\n" + "\"dialect.op10_uses\"(%x10) : (index) -> ()\n"; + + MlirModule module = + mlirModuleCreateParse(ctx, mlirStringRefCreateFromCString(moduleString)); + MlirOperation op = mlirModuleGetOperation(module); + MlirBlock body = mlirModuleGetBody(module); + + // get a handle to all operations/values + MlirOperation op1 = mlirBlockGetFirstOperation(body); + MlirValue x1 = mlirOperationGetResult(op1, 0); + MlirValue y1 = mlirOperationGetResult(op1, 1); + MlirValue z1 = mlirOperationGetResult(op1, 2); + MlirOperation op2 = mlirOperationGetNextInBlock(op1); + MlirValue x2 = mlirOperationGetResult(op2, 0); + MlirValue y2 = mlirOperationGetResult(op2, 1); + MlirValue z2 = mlirOperationGetResult(op2, 2); + MlirOperation op1Uses = mlirOperationGetNextInBlock(op2); + + MlirOperation op3 = mlirOperationGetNextInBlock(op1Uses); + MlirOperation op4 = mlirOperationGetNextInBlock(op3); + MlirValue x4 = mlirOperationGetResult(op4, 0); + MlirOperation op3Uses = mlirOperationGetNextInBlock(op4); + + MlirOperation op5 = mlirOperationGetNextInBlock(op3Uses); + MlirOperation op6 = mlirOperationGetNextInBlock(op5); + MlirOperation op5Uses = mlirOperationGetNextInBlock(op6); + + MlirOperation op7 = mlirOperationGetNextInBlock(op5Uses); + MlirOperation op8 = mlirOperationGetNextInBlock(op7); + MlirValue x8 = mlirOperationGetResult(op8, 0); + MlirOperation op9 = mlirOperationGetNextInBlock(op8); + MlirRegion region9 = mlirOperationGetRegion(op9, 0); + MlirBlock block9 = mlirRegionGetFirstBlock(region9); + MlirOperation op7Uses = mlirOperationGetNextInBlock(op9); + + MlirOperation op10 = mlirOperationGetNextInBlock(op7Uses); + MlirValue x10 = mlirOperationGetResult(op10, 0); + MlirOperation op11 = mlirOperationGetNextInBlock(op10); + MlirValue x11 = mlirOperationGetResult(op11, 0); + MlirOperation op10Uses1 = mlirOperationGetNextInBlock(op11); + + MlirRewriterBase rewriter = mlirIRRewriterCreate(ctx); + + // Replace values + mlirRewriterBaseReplaceAllUsesWith(rewriter, x1, x2); + MlirValue y1z1[2] = {y1, z1}; + MlirValue y2z2[2] = {y2, z2}; + mlirRewriterBaseReplaceAllValueRangeUsesWith(rewriter, 2, y1z1, y2z2); + + // Replace op with values + mlirRewriterBaseReplaceOpWithValues(rewriter, op3, 1, &x4); + + // Replace op with op + mlirRewriterBaseReplaceOpWithOperation(rewriter, op5, op6); + + // Replace op with op in block + mlirRewriterBaseReplaceOpUsesWithinBlock(rewriter, op7, 1, &x8, block9); + + // Replace value with value except in op + mlirRewriterBaseReplaceAllUsesExcept(rewriter, x10, x11, op10Uses1); + + mlirOperationDump(op); + // clang-format off + // CHECK-NEXT: module { + // CHECK-NEXT: %{{.*}}:3 = "dialect.op1"() : () -> (index, index, index) + // CHECK-NEXT: %[[res2:.*]]:3 = "dialect.op2"() : () -> (index, index, index) + // CHECK-NEXT: "dialect.op1_uses"(%[[res2]]#0, %[[res2]]#1, %[[res2]]#2) : (index, index, index) -> () + // CHECK-NEXT: %[[res4:.*]] = "dialect.op4"() : () -> index + // CHECK-NEXT: "dialect.op3_uses"(%[[res4]]) : (index) -> () + // CHECK-NEXT: %[[res6:.*]] = "dialect.op6"() : () -> index + // CHECK-NEXT: "dialect.op5_uses"(%[[res6]]) : (index) -> () + // CHECK-NEXT: %[[res7:.*]] = "dialect.op7"() : () -> index + // CHECK-NEXT: %[[res8:.*]] = "dialect.op8"() : () -> index + // CHECK-NEXT: "dialect.op9"() ({ + // CHECK-NEXT: "dialect.op7_uses"(%[[res8]]) : (index) -> () + // CHECK-NEXT: }) : () -> () + // CHECK-NEXT: "dialect.op7_uses"(%[[res7]]) : (index) -> () + // CHECK-NEXT: %[[res10:.*]] = "dialect.op10"() : () -> index + // CHECK-NEXT: %[[res11:.*]] = "dialect.op11"() : () -> index + // CHECK-NEXT: "dialect.op10_uses"(%[[res10]]) : (index) -> () + // CHECK-NEXT: "dialect.op10_uses"(%[[res11]]) : (index) -> () + // CHECK-NEXT: } + // clang-format on + + mlirIRRewriterDestroy(rewriter); + mlirModuleDestroy(module); +} + +int main(void) { + MlirContext ctx = mlirContextCreate(); + mlirContextSetAllowUnregisteredDialects(ctx, true); + mlirContextGetOrLoadDialect(ctx, mlirStringRefCreateFromCString("builtin")); + + testInsertionPoint(ctx); + testCreateBlock(ctx); + testInlineRegionBlock(ctx); + testReplaceOp(ctx); + testErase(ctx); + testMove(ctx); + testOpModification(ctx); + testReplaceUses(ctx); + + mlirContextDestroy(ctx); + return 0; +} diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index 45009a78aa49f9..df95e5db11f1e0 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -105,6 +105,7 @@ set(MLIR_TEST_DEPENDS mlir-capi-llvm-test mlir-capi-pass-test mlir-capi-quant-test + mlir-capi-rewrite-test mlir-capi-sparse-tensor-test mlir-capi-transform-test mlir-capi-transform-interpreter-test diff --git a/mlir/test/Conversion/FuncToLLVM/func-memref-return.mlir b/mlir/test/Conversion/FuncToLLVM/func-memref-return.mlir index 91ef571cb3bf71..6b9df32fe02dd3 100644 --- a/mlir/test/Conversion/FuncToLLVM/func-memref-return.mlir +++ b/mlir/test/Conversion/FuncToLLVM/func-memref-return.mlir @@ -1,8 +1,8 @@ // RUN: mlir-opt -convert-func-to-llvm -reconcile-unrealized-casts %s | FileCheck %s -// RUN: mlir-opt -convert-func-to-llvm='use-bare-ptr-memref-call-conv=1' %s | FileCheck %s --check-prefix=BAREPTR +// RUN: mlir-opt -convert-func-to-llvm='use-bare-ptr-memref-call-conv=1' -reconcile-unrealized-casts %s | FileCheck %s --check-prefix=BAREPTR -// RUN: mlir-opt -transform-interpreter %s | FileCheck %s --check-prefix=BAREPTR +// RUN: mlir-opt -transform-interpreter -reconcile-unrealized-casts %s | FileCheck %s --check-prefix=BAREPTR // These tests were separated from func-memref.mlir because applying // -reconcile-unrealized-casts resulted in `llvm.extractvalue` ops getting diff --git a/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir b/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir index e1babdd2f1f63a..3f4e70a6835af5 100644 --- a/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir +++ b/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir @@ -133,7 +133,7 @@ func.func @materialize_read(%M: index, %N: index, %O: index, %P: index) { affine.for %i1 = 0 to %N { affine.for %i2 = 0 to %O { affine.for %i3 = 0 to %P step 5 { - %f = vector.transfer_read %A[%i0, %i1, %i2, %i3], %f0 {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, 0, d0)>} : memref, vector<5x4x3xf32> + %f = vector.transfer_read %A[%i0, %i1, %i2, %i3], %f0 {in_bounds = [false, true, false], permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, 0, d0)>} : memref, vector<5x4x3xf32> // Add a dummy use to prevent dead code elimination from removing // transfer read ops. "dummy_use"(%f) : (vector<5x4x3xf32>) -> () @@ -507,7 +507,7 @@ func.func @transfer_read_with_tensor(%arg: tensor) -> vector<1xf32> { // CHECK-NEXT: %[[RESULT:.*]] = vector.broadcast %[[EXTRACTED]] : f32 to vector<1xf32> // CHECK-NEXT: return %[[RESULT]] : vector<1xf32> %f0 = arith.constant 0.0 : f32 - %0 = vector.transfer_read %arg[], %f0 {permutation_map = affine_map<()->(0)>} : + %0 = vector.transfer_read %arg[], %f0 {in_bounds = [true], permutation_map = affine_map<()->(0)>} : tensor, vector<1xf32> return %0: vector<1xf32> } @@ -746,7 +746,7 @@ func.func @cannot_lower_transfer_read_with_leading_scalable(%arg0: memref, %mask: vector<1x1xi1>) -> vector<1x1x1x1xi32> { %c0 = arith.constant 0 : index %c0_i32 = arith.constant 0 : i32 - %3 = vector.transfer_read %subview[%c0, %c0, %c0, %c0], %c0_i32, %mask {permutation_map = #map1} + %3 = vector.transfer_read %subview[%c0, %c0, %c0, %c0], %c0_i32, %mask {in_bounds = [false, true, true, false], permutation_map = #map1} : memref<1x1x1x1xi32>, vector<1x1x1x1xi32> return %3 : vector<1x1x1x1xi32> } diff --git a/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir b/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir index 0d67851dfe41de..667aad7645c51c 100644 --- a/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir +++ b/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir @@ -483,6 +483,30 @@ func.func @shuffle(%v0 : vector<1xi32>, %v1: vector<1xi32>) -> vector<2xi32> { // ----- +// CHECK-LABEL: func @shuffle +// CHECK-SAME: %[[ARG0:.+]]: vector<4xi32>, %[[ARG1:.+]]: vector<4xi32> +// CHECK: %[[EXTR:.+]] = spirv.CompositeExtract %[[ARG0]][0 : i32] : vector<4xi32> +// CHECK: %[[RES:.+]] = builtin.unrealized_conversion_cast %[[EXTR]] : i32 to vector<1xi32> +// CHECK: return %[[RES]] : vector<1xi32> +func.func @shuffle(%v0 : vector<4xi32>, %v1: vector<4xi32>) -> vector<1xi32> { + %shuffle = vector.shuffle %v0, %v1 [0] : vector<4xi32>, vector<4xi32> + return %shuffle : vector<1xi32> +} + +// ----- + +// CHECK-LABEL: func @shuffle +// CHECK-SAME: %[[ARG0:.+]]: vector<4xi32>, %[[ARG1:.+]]: vector<4xi32> +// CHECK: %[[EXTR:.+]] = spirv.CompositeExtract %[[ARG1]][1 : i32] : vector<4xi32> +// CHECK: %[[RES:.+]] = builtin.unrealized_conversion_cast %[[EXTR]] : i32 to vector<1xi32> +// CHECK: return %[[RES]] : vector<1xi32> +func.func @shuffle(%v0 : vector<4xi32>, %v1: vector<4xi32>) -> vector<1xi32> { + %shuffle = vector.shuffle %v0, %v1 [5] : vector<4xi32>, vector<4xi32> + return %shuffle : vector<1xi32> +} + +// ----- + // CHECK-LABEL: func @interleave // CHECK-SAME: (%[[ARG0:.+]]: vector<2xf32>, %[[ARG1:.+]]: vector<2xf32>) // CHECK: %[[SHUFFLE:.*]] = spirv.VectorShuffle [0 : i32, 2 : i32, 1 : i32, 3 : i32] %[[ARG0]], %[[ARG1]] : vector<2xf32>, vector<2xf32> -> vector<4xf32> diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir index 9244604128cb72..0a077624d18f88 100644 --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir @@ -22,7 +22,7 @@ func.func @vec1d_1(%A : memref, %B : memref) { // CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]]) // CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]]) // CHECK-NEXT: %{{.*}} = arith.constant 0.0{{.*}}: f32 -// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> +// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true], permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> affine.for %i0 = 0 to %M { // vectorized due to scalar -> vector %a0 = affine.load %A[%c0, %c0] : memref } @@ -425,7 +425,7 @@ func.func @vec_rejected_8(%A : memref, %B : memref) { // CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) // CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) // CHECK: %{{.*}} = arith.constant 0.0{{.*}}: f32 -// CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> +// CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true], permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %{{.*}} in DFS post-order prevents vectorizing %{{.*}} affine.for %i18 = 0 to %M { // vectorized due to scalar -> vector %a18 = affine.load %A[%c0, %c0] : memref @@ -459,7 +459,7 @@ func.func @vec_rejected_9(%A : memref, %B : memref) { // CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) // CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}}) // CHECK-NEXT: %{{.*}} = arith.constant 0.0{{.*}}: f32 -// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> +// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true], permutation_map = #[[$map_proj_d0d1_0]]} : memref, vector<128xf32> affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %i18 in DFS post-order prevents vectorizing %{{.*}} affine.for %i18 = 0 to %M { // vectorized due to scalar -> vector %a18 = affine.load %A[%c0, %c0] : memref diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir index 83916e755363ba..eb5120a49e3d4b 100644 --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir @@ -123,8 +123,8 @@ func.func @vectorize_matmul(%arg0: memref, %arg1: memref, %arg // VECT: affine.for %[[I2:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[M]]) step 4 { // VECT-NEXT: affine.for %[[I3:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[N]]) step 8 { // VECT-NEXT: affine.for %[[I4:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[K]]) { - // VECT: %[[A:.*]] = vector.transfer_read %{{.*}}[%[[I4]], %[[I3]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_zerod1]]} : memref, vector<4x8xf32> - // VECT: %[[B:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I4]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_d0zero]]} : memref, vector<4x8xf32> + // VECT: %[[A:.*]] = vector.transfer_read %{{.*}}[%[[I4]], %[[I3]]], %{{.*}} {in_bounds = [true, false], permutation_map = #[[$map_proj_d0d1_zerod1]]} : memref, vector<4x8xf32> + // VECT: %[[B:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I4]]], %{{.*}} {in_bounds = [false, true], permutation_map = #[[$map_proj_d0d1_d0zero]]} : memref, vector<4x8xf32> // VECT-NEXT: %[[C:.*]] = arith.mulf %[[B]], %[[A]] : vector<4x8xf32> // VECT: %[[D:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I3]]], %{{.*}} : memref, vector<4x8xf32> // VECT-NEXT: %[[E:.*]] = arith.addf %[[D]], %[[C]] : vector<4x8xf32> diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_affine_apply.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_affine_apply.mlir index 15a7133cf0f65f..16ade6455d6974 100644 --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_affine_apply.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_affine_apply.mlir @@ -141,7 +141,7 @@ func.func @affine_map_with_expr_2(%arg0: memref<8x12x16xf32>, %arg1: memref<8x24 // CHECK-NEXT: %[[S1:.*]] = affine.apply #[[$MAP_ID4]](%[[ARG3]], %[[ARG4]], %[[I0]]) // CHECK-NEXT: %[[S2:.*]] = affine.apply #[[$MAP_ID5]](%[[ARG3]], %[[ARG4]], %[[I0]]) // CHECK-NEXT: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK-NEXT: %[[S3:.*]] = vector.transfer_read %[[ARG0]][%[[S0]], %[[S1]], %[[S2]]], %[[CST]] {permutation_map = #[[$MAP_ID6]]} : memref<8x12x16xf32>, vector<8xf32> +// CHECK-NEXT: %[[S3:.*]] = vector.transfer_read %[[ARG0]][%[[S0]], %[[S1]], %[[S2]]], %[[CST]] {in_bounds = [true], permutation_map = #[[$MAP_ID6]]} : memref<8x12x16xf32>, vector<8xf32> // CHECK-NEXT: vector.transfer_write %[[S3]], %[[ARG1]][%[[ARG3]], %[[ARG4]], %[[ARG5]]] : vector<8xf32>, memref<8x24x48xf32> // CHECK-NEXT: } // CHECK-NEXT: } diff --git a/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations.mlir b/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations.mlir index 5fedd45555fcd8..edffcbdd0ba7d6 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations.mlir @@ -154,3 +154,44 @@ func.func @conversion_dealloc_multiple_memrefs_and_retained(%arg0: memref<2xf32> // CHECK-NEXT: memref.store [[DEALLOC_COND]], [[DEALLOC_CONDS_OUT]][[[OUTER_ITER]]] // CHECK-NEXT: } // CHECK-NEXT: return + +// ----- + +// This test check dealloc_helper function is generated on each nested symbol +// table operation when needed and only generated once. +module @conversion_nest_module_dealloc_helper { + func.func @top_level_func(%arg0: memref<2xf32>, %arg1: memref<5xf32>, %arg2: memref<1xf32>, %arg3: i1, %arg4: i1, %arg5: memref<2xf32>) -> (i1, i1) { + %0:2 = bufferization.dealloc (%arg0, %arg1 : memref<2xf32>, memref<5xf32>) if (%arg3, %arg4) retain (%arg2, %arg5 : memref<1xf32>, memref<2xf32>) + func.return %0#0, %0#1 : i1, i1 + } + module @nested_module_not_need_dealloc_helper { + func.func @nested_module_not_need_dealloc_helper_func(%arg0: memref<2xf32>, %arg1: memref<1xf32>, %arg2: i1, %arg3: memref<2xf32>) -> (i1, i1) { + %0:2 = bufferization.dealloc (%arg0 : memref<2xf32>) if (%arg2) retain (%arg1, %arg3 : memref<1xf32>, memref<2xf32>) + return %0#0, %0#1 : i1, i1 + } + } + module @nested_module_need_dealloc_helper { + func.func @nested_module_need_dealloc_helper_func0(%arg0: memref<2xf32>, %arg1: memref<5xf32>, %arg2: memref<1xf32>, %arg3: i1, %arg4: i1, %arg5: memref<2xf32>) -> (i1, i1) { + %0:2 = bufferization.dealloc (%arg0, %arg1 : memref<2xf32>, memref<5xf32>) if (%arg3, %arg4) retain (%arg2, %arg5 : memref<1xf32>, memref<2xf32>) + func.return %0#0, %0#1 : i1, i1 + } + func.func @nested_module_need_dealloc_helper_func1(%arg0: memref<2xf32>, %arg1: memref<5xf32>, %arg2: memref<1xf32>, %arg3: i1, %arg4: i1, %arg5: memref<2xf32>) -> (i1, i1) { + %0:2 = bufferization.dealloc (%arg0, %arg1 : memref<2xf32>, memref<5xf32>) if (%arg3, %arg4) retain (%arg2, %arg5 : memref<1xf32>, memref<2xf32>) + func.return %0#0, %0#1 : i1, i1 + } + } +} + +// CHECK: module @conversion_nest_module_dealloc_helper { +// CHECK: func.func @top_level_func +// CHECK: call @dealloc_helper +// CHECK: module @nested_module_not_need_dealloc_helper { +// CHECK: func.func @nested_module_not_need_dealloc_helper_func +// CHECK-NOT: @dealloc_helper +// CHECK: module @nested_module_need_dealloc_helper { +// CHECK: func.func @nested_module_need_dealloc_helper_func0 +// CHECK: call @dealloc_helper +// CHECK: func.func @nested_module_need_dealloc_helper_func1 +// CHECK: call @dealloc_helper +// CHECK: func.func private @dealloc_helper +// CHECK: func.func private @dealloc_helper diff --git a/mlir/test/Dialect/Linalg/hoisting.mlir b/mlir/test/Dialect/Linalg/hoisting.mlir index 241b8a486c012e..44c15c272bb3ef 100644 --- a/mlir/test/Dialect/Linalg/hoisting.mlir +++ b/mlir/test/Dialect/Linalg/hoisting.mlir @@ -200,7 +200,7 @@ func.func @hoist_vector_transfer_pairs_in_affine_loops(%memref0: memref<64x64xi3 affine.for %arg3 = 0 to 64 { affine.for %arg4 = 0 to 64 step 16 { affine.for %arg5 = 0 to 64 { - %0 = vector.transfer_read %memref0[%arg3, %arg5], %c0_i32 {permutation_map = affine_map<(d0, d1) -> (0)>} : memref<64x64xi32>, vector<16xi32> + %0 = vector.transfer_read %memref0[%arg3, %arg5], %c0_i32 {in_bounds = [true], permutation_map = affine_map<(d0, d1) -> (0)>} : memref<64x64xi32>, vector<16xi32> %1 = vector.transfer_read %memref1[%arg5, %arg4], %c0_i32 : memref<64x64xi32>, vector<16xi32> %2 = vector.transfer_read %memref2[%arg3, %arg4], %c0_i32 : memref<64x64xi32>, vector<16xi32> %3 = arith.muli %0, %1 : vector<16xi32> diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir index bbeccc7fecd68b..783149971f0d60 100644 --- a/mlir/test/Dialect/Linalg/vectorization.mlir +++ b/mlir/test/Dialect/Linalg/vectorization.mlir @@ -130,7 +130,7 @@ func.func @vectorize_dynamic_1d_broadcast(%arg0: tensor, // CHECK-LABEL: @vectorize_dynamic_1d_broadcast // CHECK: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK: %[[VAL_4:.*]] = tensor.dim %{{.*}}, %[[VAL_3]] : tensor -// CHECK: %[[VAL_7:.*]] = vector.transfer_read %{{.*}} {permutation_map = #{{.*}}} : tensor, vector<4xf32> +// CHECK: %[[VAL_7:.*]] = vector.transfer_read %{{.*}} {in_bounds = {{.*}}, permutation_map = #{{.*}}} : tensor, vector<4xf32> // CHECK: %[[VAL_9:.*]] = vector.create_mask %[[VAL_4]] : vector<4xi1> // CHECK: %[[VAL_10:.*]] = vector.mask %[[VAL_9]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor, vector<4xf32> } : vector<4xi1> -> vector<4xf32> // CHECK: %[[VAL_12:.*]] = vector.mask %[[VAL_9]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor, vector<4xf32> } : vector<4xi1> -> vector<4xf32> diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir index 459ccd73cfe619..268946803de7a5 100644 --- a/mlir/test/Dialect/SCF/canonicalize.mlir +++ b/mlir/test/Dialect/SCF/canonicalize.mlir @@ -1846,3 +1846,21 @@ func.func @index_switch_fold() -> (f32, f32) { // CHECK-NEXT: %[[c1:.*]] = arith.constant 1.000000e+00 : f32 // CHECK-NEXT: %[[c42:.*]] = arith.constant 4.200000e+01 : f32 // CHECK-NEXT: return %[[c1]], %[[c42]] : f32, f32 + +// ----- + +func.func @index_switch_fold_no_res() { + %c1 = arith.constant 1 : index + scf.index_switch %c1 + case 0 { + scf.yield + } + default { + "test.op"() : () -> () + scf.yield + } + return +} + +// CHECK-LABEL: func.func @index_switch_fold_no_res() +// CHECK-NEXT: "test.op"() : () -> () diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir index db169a6c1f8ae4..208982a3e0e7b5 100644 --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -478,7 +478,7 @@ func.func @test_vector.transfer_read(%arg0: memref>) { %c3 = arith.constant 3 : index %f0 = arith.constant 0.0 : f32 %vf0 = vector.splat %f0 : vector<2x3xf32> - // expected-error@+1 {{ expects the optional in_bounds attr of same rank as permutation_map results: affine_map<(d0, d1) -> (d0, d1)>}} + // expected-error@+1 {{ expects the in_bounds attr of same rank as permutation_map results: affine_map<(d0, d1) -> (d0, d1)>}} %0 = vector.transfer_read %arg0[%c3, %c3], %vf0 {in_bounds = [true], permutation_map = affine_map<(d0, d1)->(d0, d1)>} : memref>, vector<1x1x2x3xf32> } diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir index 531e2db6364314..7e578452b82cc0 100644 --- a/mlir/test/Dialect/Vector/ops.mlir +++ b/mlir/test/Dialect/Vector/ops.mlir @@ -70,7 +70,7 @@ func.func @vector_transfer_ops(%arg0: memref, // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}}, %{{.*}} : memref, vector<5xf32> %8 = vector.transfer_read %arg0[%c3, %c3], %f0, %m : memref, vector<5xf32> // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]], %[[C3]]], %{{.*}}, %{{.*}} : memref, vector<5x4x8xf32> - %9 = vector.transfer_read %arg4[%c3, %c3, %c3], %f0, %m2 {permutation_map = affine_map<(d0, d1, d2)->(d1, d0, 0)>} : memref, vector<5x4x8xf32> + %9 = vector.transfer_read %arg4[%c3, %c3, %c3], %f0, %m2 {in_bounds = [false, false, true], permutation_map = affine_map<(d0, d1, d2)->(d1, d0, 0)>} : memref, vector<5x4x8xf32> // CHECK: vector.transfer_write vector.transfer_write %0, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0)>} : vector<128xf32>, memref diff --git a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir index 3c0414c83ed68f..3f36756fe95d77 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir @@ -1,16 +1,14 @@ // RUN: mlir-opt %s -test-vector-transfer-collapse-inner-most-dims -split-input-file | FileCheck %s -// TODO: Unify how memref and vectors are named - //----------------------------------------------------------------------------- // 1. vector.transfer_read //----------------------------------------------------------------------------- -func.func @contiguous_inner_most(%in: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x1xf32>{ +func.func @contiguous_inner_most(%src: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x1xf32>{ %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f32 - %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x1xf32> - return %0 : vector<1x8x1xf32> + %pad = arith.constant 0.0 : f32 + %v = vector.transfer_read %src[%c0, %c0, %c0, %c0], %pad {in_bounds = [true, true, true]} : memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x1xf32> + return %v : vector<1x8x1xf32> } // CHECK: func @contiguous_inner_most(%[[SRC:.+]]: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>> @@ -23,13 +21,13 @@ func.func @contiguous_inner_most(%in: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1 // Same as the top example within this split, but with the inner vector // dim scalable. Note that this example only makes sense when "8 = [8]" (i.e. -// vscale = 1). This is assumed (implicitly) via the `in_bounds` attribute. +// vscale = 1). This is assumed via the `in_bounds` attribute. -func.func @contiguous_inner_most_scalable_inner_dim(%in: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x[8]x1xf32>{ +func.func @contiguous_inner_most_scalable_inner_dim(%src: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x[8]x1xf32>{ %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f32 - %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x[8]x1xf32> - return %0 : vector<1x[8]x1xf32> + %pad = arith.constant 0.0 : f32 + %v = vector.transfer_read %src[%c0, %c0, %c0, %c0], %pad {in_bounds = [true, true, true]} : memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x[8]x1xf32> + return %v : vector<1x[8]x1xf32> } // CHECK: func @contiguous_inner_most_scalable_inner_dim(%[[SRC:.+]]: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>> @@ -43,11 +41,11 @@ func.func @contiguous_inner_most_scalable_inner_dim(%in: memref<1x1x8x1xf32, str // Same as the top example within this split, but the trailing unit dim was // replaced with a dyn dim - not supported -func.func @negative_dynamic_trailing_dim(%in: memref<1x1x8x?xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x1xf32>{ +func.func @negative_dynamic_trailing_dim(%src: memref<1x1x8x?xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x1xf32>{ %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f32 - %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8x?xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x1xf32> - return %0 : vector<1x8x1xf32> + %pad = arith.constant 0.0 : f32 + %v = vector.transfer_read %src[%c0, %c0, %c0, %c0], %pad {in_bounds = [true, true, true]} : memref<1x1x8x?xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x1xf32> + return %v : vector<1x8x1xf32> } // CHECK-LABEL: func @negative_dynamic_trailing_dim @@ -57,11 +55,11 @@ func.func @negative_dynamic_trailing_dim(%in: memref<1x1x8x?xf32, strided<[3072, // Same as the top example within this split, but with a "scalable unit" dim in // the output vector - not supported (scalable 1, [1], is _not_ a unit dimension). -func.func @negative_scalable_one_trailing_dim(%in: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x[1]xf32>{ +func.func @negative_scalable_one_trailing_dim(%src: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x[1]xf32>{ %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f32 - %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x[1]xf32> - return %0 : vector<1x8x[1]xf32> + %pad = arith.constant 0.0 : f32 + %v = vector.transfer_read %src[%c0, %c0, %c0, %c0], %pad {in_bounds = [true, true, true]} : memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x[1]xf32> + return %v : vector<1x8x[1]xf32> } // CHECK-LABEL: func @negative_scalable_one_trailing_dim // CHECK-NOT: memref.subview @@ -69,10 +67,10 @@ func.func @negative_scalable_one_trailing_dim(%in: memref<1x1x8x1xf32, strided<[ // ----- -func.func @contiguous_inner_most_dynamic_outer(%a: index, %b: index, %memref: memref) -> vector<8x1xf32> { +func.func @contiguous_inner_most_dynamic_outer(%i: index, %ii: index, %memref: memref) -> vector<8x1xf32> { %c0 = arith.constant 0 : index %pad = arith.constant 0.0 : f32 - %v = vector.transfer_read %memref[%a, %b, %c0, %c0], %pad {in_bounds = [true, true]} : memref, vector<8x1xf32> + %v = vector.transfer_read %memref[%i, %ii, %c0, %c0], %pad {in_bounds = [true, true]} : memref, vector<8x1xf32> return %v : vector<8x1xf32> } // CHECK: func.func @contiguous_inner_most_dynamic_outer @@ -93,12 +91,12 @@ func.func @contiguous_inner_most_dynamic_outer(%a: index, %b: index, %memref: me // Same as the top example within this split, but with the outer vector // dim scalable. Note that this example only makes sense when "8 = [8]" (i.e. -// vscale = 1). This is assumed (implicitly) via the `in_bounds` attribute. +// vscale = 1). This is assumed via the `in_bounds` attribute. -func.func @contiguous_inner_most_outer_dim_dyn_scalable_inner_dim(%a: index, %b: index, %memref: memref) -> vector<[8]x1xf32> { +func.func @contiguous_inner_most_outer_dim_dyn_scalable_inner_dim(%i: index, %ii: index, %memref: memref) -> vector<[8]x1xf32> { %c0 = arith.constant 0 : index %pad = arith.constant 0.0 : f32 - %v = vector.transfer_read %memref[%a, %b, %c0, %c0], %pad {in_bounds = [true, true]} : memref, vector<[8]x1xf32> + %v = vector.transfer_read %memref[%i, %ii, %c0, %c0], %pad {in_bounds = [true, true]} : memref, vector<[8]x1xf32> return %v : vector<[8]x1xf32> } // CHECK-LABEL: func @contiguous_inner_most_outer_dim_dyn_scalable_inner_dim @@ -118,11 +116,11 @@ func.func @contiguous_inner_most_outer_dim_dyn_scalable_inner_dim(%a: index, %b: // The index to be dropped is == 0, so it's safe to collapse. The other index // should be preserved correctly. -func.func @contiguous_inner_most_zero_idx_in_bounds(%A: memref<16x1xf32>, %i:index) -> (vector<8x1xf32>) { +func.func @contiguous_inner_most_zero_idx_in_bounds(%src: memref<16x1xf32>, %i:index) -> (vector<8x1xf32>) { %pad = arith.constant 0.0 : f32 %c0 = arith.constant 0 : index - %1 = vector.transfer_read %A[%i, %c0], %pad {in_bounds = [true, true]} : memref<16x1xf32>, vector<8x1xf32> - return %1 : vector<8x1xf32> + %v = vector.transfer_read %src[%i, %c0], %pad {in_bounds = [true, true]} : memref<16x1xf32>, vector<8x1xf32> + return %v : vector<8x1xf32> } // CHECK-LABEL: func.func @contiguous_inner_most_zero_idx_in_bounds( // CHECK-SAME: %[[MEM:.*]]: memref<16x1xf32>, @@ -135,11 +133,11 @@ func.func @contiguous_inner_most_zero_idx_in_bounds(%A: memref<16x1xf32>, %i:ind // The index to be dropped is == 0, so it's safe to collapse. The "out of // bounds" attribute is too conservative and will be folded to "in bounds" // before the pattern runs. The other index should be preserved correctly. -func.func @contiguous_inner_most_zero_idx_out_of_bounds(%A: memref<16x1xf32>, %i:index) -> (vector<8x1xf32>) { +func.func @contiguous_inner_most_zero_idx_out_of_bounds(%src: memref<16x1xf32>, %i:index) -> (vector<8x1xf32>) { %pad = arith.constant 0.0 : f32 %c0 = arith.constant 0 : index - %1 = vector.transfer_read %A[%i, %c0], %pad {in_bounds = [true, false]} : memref<16x1xf32>, vector<8x1xf32> - return %1 : vector<8x1xf32> + %v = vector.transfer_read %src[%i, %c0], %pad {in_bounds = [true, false]} : memref<16x1xf32>, vector<8x1xf32> + return %v : vector<8x1xf32> } // CHECK-LABEL: func.func @contiguous_inner_most_zero_idx_out_of_bounds( // CHECK-SAME: %[[MEM:.*]]: memref<16x1xf32>, @@ -151,10 +149,10 @@ func.func @contiguous_inner_most_zero_idx_out_of_bounds(%A: memref<16x1xf32>, %i // The index to be dropped is unknown, but since it's "in bounds", it has to be // == 0. It's safe to collapse the corresponding dim. -func.func @contiguous_inner_most_non_zero_idx_in_bounds(%A: memref<16x1xf32>, %i:index) -> (vector<8x1xf32>) { +func.func @contiguous_inner_most_non_zero_idx_in_bounds(%src: memref<16x1xf32>, %i:index) -> (vector<8x1xf32>) { %pad = arith.constant 0.0 : f32 - %1 = vector.transfer_read %A[%i, %i], %pad {in_bounds = [true, true]} : memref<16x1xf32>, vector<8x1xf32> - return %1 : vector<8x1xf32> + %v = vector.transfer_read %src[%i, %i], %pad {in_bounds = [true, true]} : memref<16x1xf32>, vector<8x1xf32> + return %v : vector<8x1xf32> } // CHECK-LABEL: func.func @contiguous_inner_most_non_zero_idx_in_bounds( // CHECK-SAME: %[[MEM:.*]]: memref<16x1xf32>, @@ -164,27 +162,43 @@ func.func @contiguous_inner_most_non_zero_idx_in_bounds(%A: memref<16x1xf32>, %i // CHECK: %[[READ:.*]] = vector.transfer_read %[[SV]]{{\[}}%[[IDX]]], %[[PAD]] {in_bounds = [true]} : memref<16xf32, strided<[1]>>, vector<8xf32> // CHECK: vector.shape_cast %[[READ]] : vector<8xf32> to vector<8x1xf32> +// Same as the top example within this split, but with the outer vector +// dim scalable. Note that this example only makes sense when "8 = [8]" (i.e. +// vscale = 1). This is assumed via the `in_bounds` attribute. + +func.func @contiguous_inner_most_non_zero_idx_in_bounds_scalable(%src: memref<16x1xf32>, %i:index) -> (vector<[8]x1xf32>) { + %pad = arith.constant 0.0 : f32 + %v = vector.transfer_read %src[%i, %i], %pad {in_bounds = [true, true]} : memref<16x1xf32>, vector<[8]x1xf32> + return %v : vector<[8]x1xf32> +} +// CHECK-LABEL: func.func @contiguous_inner_most_non_zero_idx_in_bounds_scalable +// CHECK-SAME: %[[MEM:.*]]: memref<16x1xf32>, +// CHECK-SAME: %[[IDX:.*]]: index) -> vector<[8]x1xf32> { +// CHECK: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[SV:.*]] = memref.subview %[[MEM]][0, 0] [16, 1] [1, 1] : memref<16x1xf32> to memref<16xf32, strided<[1]>> +// CHECK: %[[READ:.*]] = vector.transfer_read %[[SV]]{{\[}}%[[IDX]]], %[[PAD]] {in_bounds = [true]} : memref<16xf32, strided<[1]>>, vector<[8]xf32> +// CHECK: vector.shape_cast %[[READ]] : vector<[8]xf32> to vector<[8]x1xf32> + // The index to be dropped is unknown and "out of bounds" - not safe to // collapse. -func.func @negative_contiguous_inner_most_non_zero_idx_out_of_bounds(%A: memref<16x1xf32>, %i:index) -> (vector<8x1xf32>) { +func.func @negative_contiguous_inner_most_non_zero_idx_out_of_bounds(%src: memref<16x1xf32>, %i:index) -> (vector<8x1xf32>) { %pad = arith.constant 0.0 : f32 - %1 = vector.transfer_read %A[%i, %i], %pad {in_bounds = [true, false]} : memref<16x1xf32>, vector<8x1xf32> - return %1 : vector<8x1xf32> + %v = vector.transfer_read %src[%i, %i], %pad {in_bounds = [true, false]} : memref<16x1xf32>, vector<8x1xf32> + return %v : vector<8x1xf32> } // CHECK-LABEL: func.func @negative_contiguous_inner_most_non_zero_idx_out_of_bounds( // CHECK-NOT: memref.subview // CHECK-NOT: memref.shape_cast // CHECK: vector.transfer_read - // ----- -func.func @contiguous_inner_most_dim_with_subview(%A: memref<1000x1xf32>, %i:index, %ii:index) -> (vector<4x1xf32>) { +func.func @contiguous_inner_most_dim_with_subview(%src: memref<1000x1xf32>, %i:index, %ii:index) -> (vector<4x1xf32>) { %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f32 - %0 = memref.subview %A[%i, 0] [40, 1] [1, 1] : memref<1000x1xf32> to memref<40x1xf32, strided<[1, 1], offset: ?>> - %1 = vector.transfer_read %0[%ii, %c0], %cst {in_bounds = [true, true]} : memref<40x1xf32, strided<[1, 1], offset: ?>>, vector<4x1xf32> - return %1 : vector<4x1xf32> + %pad = arith.constant 0.0 : f32 + %sv = memref.subview %src[%i, 0] [40, 1] [1, 1] : memref<1000x1xf32> to memref<40x1xf32, strided<[1, 1], offset: ?>> + %v = vector.transfer_read %sv[%ii, %c0], %pad {in_bounds = [true, true]} : memref<40x1xf32, strided<[1, 1], offset: ?>>, vector<4x1xf32> + return %v : vector<4x1xf32> } // CHECK: func @contiguous_inner_most_dim_with_subview(%[[SRC:.+]]: memref<1000x1xf32>, %[[II:.+]]: index, %[[J:.+]]: index) -> vector<4x1xf32> // CHECK: %[[SRC_0:.+]] = memref.subview %[[SRC]] @@ -195,14 +209,14 @@ func.func @contiguous_inner_most_dim_with_subview(%A: memref<1000x1xf32>, %i:ind // Same as the top example within this split, but with the outer vector // dim scalable. Note that this example only makes sense when "4 = [4]" (i.e. -// vscale = 1). This is assumed (implicitly) via the `in_bounds` attribute. +// vscale = 1). This is assumed via the `in_bounds` attribute. -func.func @contiguous_inner_most_dim_with_subview_scalable_inner_dim(%A: memref<1000x1xf32>, %i:index, %ii:index) -> (vector<[4]x1xf32>) { +func.func @contiguous_inner_most_dim_with_subview_scalable_inner_dim(%src: memref<1000x1xf32>, %i:index, %ii:index) -> (vector<[4]x1xf32>) { %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f32 - %0 = memref.subview %A[%i, 0] [40, 1] [1, 1] : memref<1000x1xf32> to memref<40x1xf32, strided<[1, 1], offset: ?>> - %1 = vector.transfer_read %0[%ii, %c0], %cst {in_bounds = [true, true]} : memref<40x1xf32, strided<[1, 1], offset: ?>>, vector<[4]x1xf32> - return %1 : vector<[4]x1xf32> + %pad = arith.constant 0.0 : f32 + %sv = memref.subview %src[%i, 0] [40, 1] [1, 1] : memref<1000x1xf32> to memref<40x1xf32, strided<[1, 1], offset: ?>> + %v = vector.transfer_read %sv[%ii, %c0], %pad {in_bounds = [true, true]} : memref<40x1xf32, strided<[1, 1], offset: ?>>, vector<[4]x1xf32> + return %v : vector<[4]x1xf32> } // CHECK-LABEL: func @contiguous_inner_most_dim_with_subview_scalable_inner_dim // CHECK-SAME: %[[SRC:.+]]: memref<1000x1xf32> @@ -213,12 +227,12 @@ func.func @contiguous_inner_most_dim_with_subview_scalable_inner_dim(%A: memref< // ----- -func.func @contiguous_inner_most_dim_with_subview_2d(%A: memref<1000x1x1xf32>, %i:index, %ii:index) -> (vector<4x1x1xf32>) { +func.func @contiguous_inner_most_dim_with_subview_2d(%src: memref<1000x1x1xf32>, %i:index, %ii:index) -> (vector<4x1x1xf32>) { %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f32 - %0 = memref.subview %A[%i, 0, 0] [40, 1, 1] [1, 1, 1] : memref<1000x1x1xf32> to memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>> - %1 = vector.transfer_read %0[%ii, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>>, vector<4x1x1xf32> - return %1 : vector<4x1x1xf32> + %pad = arith.constant 0.0 : f32 + %sv = memref.subview %src[%i, 0, 0] [40, 1, 1] [1, 1, 1] : memref<1000x1x1xf32> to memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>> + %v = vector.transfer_read %sv[%ii, %c0, %c0], %pad {in_bounds = [true, true, true]} : memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>>, vector<4x1x1xf32> + return %v : vector<4x1x1xf32> } // CHECK: func @contiguous_inner_most_dim_with_subview_2d(%[[SRC:.+]]: memref<1000x1x1xf32>, %[[II:.+]]: index, %[[J:.+]]: index) -> vector<4x1x1xf32> // CHECK: %[[SRC_0:.+]] = memref.subview %[[SRC]] @@ -229,14 +243,14 @@ func.func @contiguous_inner_most_dim_with_subview_2d(%A: memref<1000x1x1xf32>, % // Same as the top example within this split, but with the outer vector // dim scalable. Note that this example only makes sense when "4 = [4]" (i.e. -// vscale = 1). This is assumed (implicitly) via the `in_bounds` attribute. +// vscale = 1). This is assumed via the `in_bounds` attribute. -func.func @contiguous_inner_most_dim_with_subview_2d_scalable_inner_dim(%A: memref<1000x1x1xf32>, %i:index, %ii:index) -> (vector<[4]x1x1xf32>) { +func.func @contiguous_inner_most_dim_with_subview_2d_scalable_inner_dim(%src: memref<1000x1x1xf32>, %i:index, %ii:index) -> (vector<[4]x1x1xf32>) { %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f32 - %0 = memref.subview %A[%i, 0, 0] [40, 1, 1] [1, 1, 1] : memref<1000x1x1xf32> to memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>> - %1 = vector.transfer_read %0[%ii, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>>, vector<[4]x1x1xf32> - return %1 : vector<[4]x1x1xf32> + %pad = arith.constant 0.0 : f32 + %sv = memref.subview %src[%i, 0, 0] [40, 1, 1] [1, 1, 1] : memref<1000x1x1xf32> to memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>> + %v = vector.transfer_read %sv[%ii, %c0, %c0], %pad {in_bounds = [true, true, true]} : memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>>, vector<[4]x1x1xf32> + return %v : vector<[4]x1x1xf32> } // CHECK-LABEL: func @contiguous_inner_most_dim_with_subview_2d_scalable_inner_dim( // CHECK-SAME: %[[SRC:.+]]: memref<1000x1x1xf32>, %[[II:.+]]: index, %[[J:.+]]: index) -> vector<[4]x1x1xf32> @@ -251,11 +265,11 @@ func.func @contiguous_inner_most_dim_with_subview_2d_scalable_inner_dim(%A: memr // NOTE: This is an out-of-bounds access. -func.func @negative_non_unit_inner_vec_dim(%arg0: memref<4x1xf32>) -> vector<4x8xf32> { +func.func @negative_non_unit_inner_vec_dim(%src: memref<4x1xf32>) -> vector<4x8xf32> { %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = vector.transfer_read %arg0[%c0, %c0], %cst : memref<4x1xf32>, vector<4x8xf32> - return %0 : vector<4x8xf32> + %pad = arith.constant 0.000000e+00 : f32 + %v = vector.transfer_read %src[%c0, %c0], %pad : memref<4x1xf32>, vector<4x8xf32> + return %v : vector<4x8xf32> } // CHECK: func.func @negative_non_unit_inner_vec_dim // CHECK-NOT: memref.subview @@ -263,11 +277,11 @@ func.func @negative_non_unit_inner_vec_dim(%arg0: memref<4x1xf32>) -> vector<4x8 // ----- -func.func @negative_non_unit_inner_memref_dim(%arg0: memref<4x8xf32>) -> vector<4x1xf32> { +func.func @negative_non_unit_inner_memref_dim(%src: memref<4x8xf32>) -> vector<4x1xf32> { %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = vector.transfer_read %arg0[%c0, %c0], %cst : memref<4x8xf32>, vector<4x1xf32> - return %0 : vector<4x1xf32> + %pad = arith.constant 0.000000e+00 : f32 + %v = vector.transfer_read %src[%c0, %c0], %pad : memref<4x8xf32>, vector<4x1xf32> + return %v : vector<4x1xf32> } // CHECK: func.func @negative_non_unit_inner_memref_dim // CHECK-NOT: memref.subview @@ -275,13 +289,28 @@ func.func @negative_non_unit_inner_memref_dim(%arg0: memref<4x8xf32>) -> vector< // ----- +// The inner most unit dims can not be dropped if the strides are not ones. + +func.func @negative_non_unit_strides(%src: memref<512x16x1xf32, strided<[8192, 16, 4], offset: ?>>, %i: index) -> vector<16x16x1xf32> { + %c0 = arith.constant 0 : index + %pad = arith.constant 0.000000e+00 : f32 + %v = vector.transfer_read %src[%i, %c0, %c0], %pad + {in_bounds = [true, true, true]} + : memref<512x16x1xf32, strided<[8192, 16, 4], offset: ?>>, vector<16x16x1xf32> + return %v : vector<16x16x1xf32> +} +// CHECK: func.func @negative_non_unit_strides +// CHECK-NOT: memref.subview + +// ----- + //----------------------------------------------------------------------------- // 2. vector.transfer_write //----------------------------------------------------------------------------- -func.func @contiguous_inner_most(%arg0: memref<1x512x16x1x1xf32>, %arg1: vector<1x16x16x1x1xf32>, %arg2: index) { +func.func @contiguous_inner_most(%dest: memref<1x512x16x1x1xf32>, %v: vector<1x16x16x1x1xf32>, %i: index) { %c0 = arith.constant 0 : index - vector.transfer_write %arg1, %arg0[%c0, %arg2, %c0, %c0, %c0] + vector.transfer_write %v, %dest[%c0, %i, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true]} : vector<1x16x16x1x1xf32>, memref<1x512x16x1x1xf32> return @@ -299,11 +328,11 @@ func.func @contiguous_inner_most(%arg0: memref<1x512x16x1x1xf32>, %arg1: vector< // Same as the top example within this split, but with the inner vector // dim scalable. Note that this example only makes sense when "16 = [16]" (i.e. -// vscale = 1). This is assumed (implicitly) via the `in_bounds` attribute. +// vscale = 1). This is assumed via the `in_bounds` attribute. -func.func @contiguous_inner_most_scalable_inner_dim(%arg0: memref<1x512x16x1x1xf32>, %arg1: vector<1x16x[16]x1x1xf32>, %arg2: index) { +func.func @contiguous_inner_most_scalable_inner_dim(%dest: memref<1x512x16x1x1xf32>, %v: vector<1x16x[16]x1x1xf32>, %i: index) { %c0 = arith.constant 0 : index - vector.transfer_write %arg1, %arg0[%c0, %arg2, %c0, %c0, %c0] + vector.transfer_write %v, %dest[%c0, %i, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true]} : vector<1x16x[16]x1x1xf32>, memref<1x512x16x1x1xf32> return @@ -322,9 +351,9 @@ func.func @contiguous_inner_most_scalable_inner_dim(%arg0: memref<1x512x16x1x1xf // Same as the top example within this split, but the trailing unit dim was // replaced with a dyn dim - not supported -func.func @negative_dynamic_trailing_dim(%arg0: memref<1x512x16x1x?xf32>, %arg1: vector<1x16x16x1x1xf32>, %arg2: index) { +func.func @negative_dynamic_trailing_dim(%dest: memref<1x512x16x1x?xf32>, %v: vector<1x16x16x1x1xf32>, %i: index) { %c0 = arith.constant 0 : index - vector.transfer_write %arg1, %arg0[%c0, %arg2, %c0, %c0, %c0] + vector.transfer_write %v, %dest[%c0, %i, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true]} : vector<1x16x16x1x1xf32>, memref<1x512x16x1x?xf32> return @@ -336,9 +365,9 @@ func.func @negative_dynamic_trailing_dim(%arg0: memref<1x512x16x1x?xf32>, %arg1: // Same as the top example within this split, but with a "scalable unit" dim in // the input vector - not supported (scalable 1, [1], is _not_ a unit dimension). -func.func @negative_scalable_one_trailing_dim(%arg0: memref<1x512x16x1x1xf32>, %arg1: vector<1x16x16x1x[1]xf32>, %arg2: index) { +func.func @negative_scalable_one_trailing_dim(%dest: memref<1x512x16x1x1xf32>, %v: vector<1x16x16x1x[1]xf32>, %i: index) { %c0 = arith.constant 0 : index - vector.transfer_write %arg1, %arg0[%c0, %arg2, %c0, %c0, %c0] + vector.transfer_write %v, %dest[%c0, %i, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true]} : vector<1x16x16x1x[1]xf32>, memref<1x512x16x1x1xf32> return @@ -350,9 +379,9 @@ func.func @negative_scalable_one_trailing_dim(%arg0: memref<1x512x16x1x1xf32>, % // ----- -func.func @contiguous_inner_most_dynamic_outer(%a: index, %b: index, %arg0: memref, %arg1: vector<8x1xf32>) { +func.func @contiguous_inner_most_dynamic_outer(%i: index, %ii: index, %dest: memref, %v: vector<8x1xf32>) { %c0 = arith.constant 0 : index - vector.transfer_write %arg1, %arg0[%a, %b, %c0, %c0] {in_bounds = [true, true]} : vector<8x1xf32>, memref + vector.transfer_write %v, %dest[%i, %ii, %c0, %c0] {in_bounds = [true, true]} : vector<8x1xf32>, memref return } // CHECK-LABEL: func.func @contiguous_inner_most_dynamic_outer( @@ -369,11 +398,11 @@ func.func @contiguous_inner_most_dynamic_outer(%a: index, %b: index, %arg0: memr // Same as the top example within this split, but with the outer vector // dim scalable. Note that this example only makes sense when "8 = [8]" (i.e. -// vscale = 1). This is assumed (implicitly) via the `in_bounds` attribute. +// vscale = 1). This is assumed via the `in_bounds` attribute. -func.func @contiguous_inner_most_dynamic_outer_scalable_inner_dim(%a: index, %b: index, %arg0: memref, %arg1: vector<[8]x1xf32>) { +func.func @contiguous_inner_most_dynamic_outer_scalable_inner_dim(%i: index, %ii: index, %dest: memref, %v: vector<[8]x1xf32>) { %c0 = arith.constant 0 : index - vector.transfer_write %arg1, %arg0[%a, %b, %c0, %c0] {in_bounds = [true, true]} : vector<[8]x1xf32>, memref + vector.transfer_write %v, %dest[%i, %ii, %c0, %c0] {in_bounds = [true, true]} : vector<[8]x1xf32>, memref return } // CHECK-LABEL: func.func @contiguous_inner_most_dynamic_outer_scalable_inner_dim( @@ -395,9 +424,9 @@ func.func @contiguous_inner_most_dynamic_outer_scalable_inner_dim(%a: index, %b: // The index to be dropped is == 0, so it's safe to collapse. The other index // should be preserved correctly. -func.func @contiguous_inner_most_zero_idx_in_bounds(%arg0: memref<16x1xf32>, %arg1: vector<8x1xf32>, %i: index) { +func.func @contiguous_inner_most_zero_idx_in_bounds(%dest: memref<16x1xf32>, %v: vector<8x1xf32>, %i: index) { %c0 = arith.constant 0 : index - vector.transfer_write %arg1, %arg0[%i, %c0] {in_bounds = [true, true]} : vector<8x1xf32>, memref<16x1xf32> + vector.transfer_write %v, %dest[%i, %c0] {in_bounds = [true, true]} : vector<8x1xf32>, memref<16x1xf32> return } // CHECK-LABEL: func.func @contiguous_inner_most_zero_idx_in_bounds( @@ -411,9 +440,9 @@ func.func @contiguous_inner_most_zero_idx_in_bounds(%arg0: memref<16x1xf32>, %ar // The index to be dropped is == 0, so it's safe to collapse. The "out of // bounds" attribute is too conservative and will be folded to "in bounds" // before the pattern runs. The other index should be preserved correctly. -func.func @contiguous_inner_most_zero_idx_out_of_bounds(%arg0: memref<16x1xf32>, %arg1: vector<8x1xf32>, %i: index) { +func.func @contiguous_inner_most_zero_idx_out_of_bounds(%dest: memref<16x1xf32>, %v: vector<8x1xf32>, %i: index) { %c0 = arith.constant 0 : index - vector.transfer_write %arg1, %arg0[%i, %c0] {in_bounds = [true, false]} : vector<8x1xf32>, memref<16x1xf32> + vector.transfer_write %v, %dest[%i, %c0] {in_bounds = [true, false]} : vector<8x1xf32>, memref<16x1xf32> return } // CHECK-LABEL: func.func @contiguous_inner_most_zero_idx_out_of_bounds @@ -426,8 +455,8 @@ func.func @contiguous_inner_most_zero_idx_out_of_bounds(%arg0: memref<16x1xf32>, // The index to be dropped is unknown, but since it's "in bounds", it has to be // == 0. It's safe to collapse the corresponding dim. -func.func @contiguous_inner_most_dim_non_zero_idx_in_bounds(%arg0: memref<16x1xf32>, %arg1: vector<8x1xf32>, %i: index) { - vector.transfer_write %arg1, %arg0[%i, %i] {in_bounds = [true, true]} : vector<8x1xf32>, memref<16x1xf32> +func.func @contiguous_inner_most_dim_non_zero_idx_in_bounds(%dest: memref<16x1xf32>, %v: vector<8x1xf32>, %i: index) { + vector.transfer_write %v, %dest[%i, %i] {in_bounds = [true, true]} : vector<8x1xf32>, memref<16x1xf32> return } // CHECK-LABEL: func @contiguous_inner_most_dim_non_zero_idx_in_bounds @@ -438,10 +467,26 @@ func.func @contiguous_inner_most_dim_non_zero_idx_in_bounds(%arg0: memref<16x1xf // CHECK: %[[SC:.*]] = vector.shape_cast %[[VEC]] : vector<8x1xf32> to vector<8xf32> // CHECK: vector.transfer_write %[[SC]], %[[SV]]{{\[}}%[[IDX]]] {in_bounds = [true]} : vector<8xf32>, memref<16xf32, strided<[1]>> +// Same as the top example within this split, but with the outer vector +// dim scalable. Note that this example only makes sense when "8 = [8]" (i.e. +// vscale = 1). This is assumed via the `in_bounds` attribute. + +func.func @contiguous_inner_most_non_zero_idx_in_bounds_scalable(%dest: memref<16x1xf32>, %v: vector<[8]x1xf32>, %i: index) { + vector.transfer_write %v, %dest[%i, %i] {in_bounds = [true, true]} : vector<[8]x1xf32>, memref<16x1xf32> + return +} +// CHECK-LABEL: func.func @contiguous_inner_most_non_zero_idx_in_bounds_scalable( +// CHECK-SAME: %[[MEM:.*]]: memref<16x1xf32>, +// CHECK-SAME: %[[VEC:.*]]: vector<[8]x1xf32> +// CHECK-SAME: %[[IDX:.*]]: index) { +// CHECK: %[[SV:.*]] = memref.subview %[[MEM]][0, 0] [16, 1] [1, 1] : memref<16x1xf32> to memref<16xf32, strided<[1]>> +// CHECK: %[[SC:.*]] = vector.shape_cast %[[VEC]] : vector<[8]x1xf32> to vector<[8]xf32> +// CHECK: vector.transfer_write %[[SC]], %[[SV]]{{\[}}%[[IDX]]] {in_bounds = [true]} : vector<[8]xf32>, memref<16xf32, strided<[1]>> + // The index to be dropped is unknown and "out of bounds" - not safe to // collapse. -func.func @negative_contiguous_inner_most_dim_non_zero_idx_out_of_bounds(%arg0: memref<16x1xf32>, %arg1: vector<8x1xf32>, %i: index) { - vector.transfer_write %arg1, %arg0[%i, %i] {in_bounds = [true, false]} : vector<8x1xf32>, memref<16x1xf32> +func.func @negative_contiguous_inner_most_dim_non_zero_idx_out_of_bounds(%dest: memref<16x1xf32>, %v: vector<8x1xf32>, %i: index) { + vector.transfer_write %v, %dest[%i, %i] {in_bounds = [true, false]} : vector<8x1xf32>, memref<16x1xf32> return } // CHECK-LABEL: func @negative_contiguous_inner_most_dim_non_zero_idx_out_of_bounds @@ -451,33 +496,118 @@ func.func @negative_contiguous_inner_most_dim_non_zero_idx_out_of_bounds(%arg0: // ----- -func.func @drop_inner_most_dim(%arg0: memref<1x512x16x1xf32, strided<[8192, 16, 1, 1], offset: ?>>, %arg1: vector<1x16x16x1xf32>, %arg2: index) { +// Verify that the transformation does work even when the input is a "subview" + +func.func @contiguous_inner_most_dim_with_subview(%dest: memref<1000x1xf32>, %i:index, %ii:index, %vec: vector<4x1xf32>) { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.0 : f32 + %0 = memref.subview %dest[%i, 0] [40, 1] [1, 1] : memref<1000x1xf32> to memref<40x1xf32, strided<[1, 1], offset: ?>> + vector.transfer_write %vec, %0[%ii, %c0] {in_bounds = [true, true]} : vector<4x1xf32>, memref<40x1xf32, strided<[1, 1], offset: ?>> + return +} + +// CHECK-LABEL: func.func @contiguous_inner_most_dim_with_subview( +// CHECK-SAME: %[[MEM:.*]]: memref<1000x1xf32>, +// CHECK-SAME: %[[IDX_1:.*]]: index, %[[IDX_2:.*]]: index, +// CHECK-SAME: %[[VEC:.*]]: vector<4x1xf32>) { +// CHECK: %[[SV_1:.*]] = memref.subview %[[MEM]]{{\[}}%[[IDX_1]], 0] [40, 1] [1, 1] : memref<1000x1xf32> to memref<40x1xf32, strided<[1, 1], offset: ?>> +// CHECK: %[[SV_2:.*]] = memref.subview %[[SV_1]][0, 0] [40, 1] [1, 1] : memref<40x1xf32, strided<[1, 1], offset: ?>> to memref<40xf32, strided<[1], offset: ?>> +// CHECK: %[[SC:.*]] = vector.shape_cast %[[VEC]] : vector<4x1xf32> to vector<4xf32> +// CHECK: vector.transfer_write %[[SC]], %[[SV_2]]{{\[}}%[[IDX_2]]] {in_bounds = [true]} : vector<4xf32>, memref<40xf32, strided<[1], offset: ?>> + +// Same as the top example within this split, but with the outer vector +// dim scalable. Note that this example only makes sense when "4 = [4]" (i.e. +// vscale = 1). This is assumed via the `in_bounds` attribute. + +func.func @contiguous_inner_most_dim_with_subview_scalable_inner_dim(%dest: memref<1000x1xf32>, %i:index, %ii:index, %vec: vector<[4]x1xf32>) { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.0 : f32 + %0 = memref.subview %dest[%i, 0] [40, 1] [1, 1] : memref<1000x1xf32> to memref<40x1xf32, strided<[1, 1], offset: ?>> + vector.transfer_write %vec, %0[%ii, %c0] {in_bounds = [true, true]} : vector<[4]x1xf32>, memref<40x1xf32, strided<[1, 1], offset: ?>> + return +} + +// CHECK-LABEL: func.func @contiguous_inner_most_dim_with_subview_scalable_inner_dim +// CHECK-SAME: %[[MEM:.*]]: memref<1000x1xf32>, +// CHECK-SAME: %[[IDX_1:.*]]: index, %[[IDX_2:.*]]: index, +// CHECK-SAME: %[[VEC:.*]]: vector<[4]x1xf32>) { +// CHECK: %[[SV_1:.*]] = memref.subview %[[MEM]]{{\[}}%[[IDX_1]], 0] [40, 1] [1, 1] : memref<1000x1xf32> to memref<40x1xf32, strided<[1, 1], offset: ?>> +// CHECK: %[[SV_2:.*]] = memref.subview %[[SV_1]][0, 0] [40, 1] [1, 1] : memref<40x1xf32, strided<[1, 1], offset: ?>> to memref<40xf32, strided<[1], offset: ?>> +// CHECK: %[[SC:.*]] = vector.shape_cast %[[VEC]] : vector<[4]x1xf32> to vector<[4]xf32> +// CHECK: vector.transfer_write %[[SC]], %[[SV_2]]{{\[}}%[[IDX_2]]] {in_bounds = [true]} : vector<[4]xf32>, memref<40xf32, strided<[1], offset: ?>> + +// ----- + +func.func @contiguous_inner_most_dim_with_subview_2d(%dest: memref<1000x1x1xf32>, %i:index, %ii:index, %vec: vector<4x1x1xf32>) { %c0 = arith.constant 0 : index - vector.transfer_write %arg1, %arg0[%c0, %arg2, %c0, %c0] - {in_bounds = [true, true, true, true]} - : vector<1x16x16x1xf32>, memref<1x512x16x1xf32, strided<[8192, 16, 1, 1], offset: ?>> + %cst = arith.constant 0.0 : f32 + %0 = memref.subview %dest[%i, 0, 0] [40, 1, 1] [1, 1, 1] : memref<1000x1x1xf32> to memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>> + vector.transfer_write %vec, %0[%ii, %c0, %c0] {in_bounds = [true, true, true]} : vector<4x1x1xf32>, memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>> return } -// CHECK: func.func @drop_inner_most_dim -// CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] -// CHECK-SAME: %[[VEC:[a-zA-Z0-9]+]] -// CHECK-SAME: %[[IDX:[a-zA-Z0-9]+]] -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[DEST]] -// CHECK-SAME: memref<1x512x16x1xf32, strided<[8192, 16, 1, 1], offset: ?>> to memref<1x512x16xf32, strided<[8192, 16, 1], offset: ?>> -// CHECK: %[[CAST:.+]] = vector.shape_cast %[[VEC]] : vector<1x16x16x1xf32> to vector<1x16x16xf32> -// CHECK: vector.transfer_write %[[CAST]], %[[SUBVIEW]] -// CHECK-SAME: [%[[C0]], %[[IDX]], %[[C0]]] +// CHECK-LABEL: func.func @contiguous_inner_most_dim_with_subview_2d( +// CHECK-SAME: %[[MEM:.*]]: memref<1000x1x1xf32>, +// CHECK-SAME: %[[IDX_1:.*]]: index, %[[IDX_2:.*]]: index, +// CHECK-SAME: %[[VEC:.*]]: vector<4x1x1xf32>) { +// CHECK: %[[SV_1:.*]] = memref.subview %[[MEM]]{{\[}}%[[IDX_1]], 0, 0] [40, 1, 1] [1, 1, 1] : memref<1000x1x1xf32> to memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>> +// CHECK: %[[SV_2:.*]] = memref.subview %[[SV_1]][0, 0, 0] [40, 1, 1] [1, 1, 1] : memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>> to memref<40xf32, strided<[1], offset: ?>> +// CHECK: %[[SC:.*]] = vector.shape_cast %[[VEC]] : vector<4x1x1xf32> to vector<4xf32> +// CHECK: vector.transfer_write %[[SC]], %[[SV_2]]{{\[}}%[[IDX_2]]] {in_bounds = [true]} : vector<4xf32>, memref<40xf32, strided<[1], offset: ?>> + +// Same as the top example within this split, but with the outer vector +// dim scalable. Note that this example only makes sense when "4 = [4]" (i.e. +// vscale = 1). This is assumed via the `in_bounds` attribute. + +func.func @contiguous_inner_most_dim_with_subview_2d_scalable(%dest: memref<1000x1x1xf32>, %i:index, %ii:index, %vec: vector<[4]x1x1xf32>) { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.0 : f32 + %0 = memref.subview %dest[%i, 0, 0] [40, 1, 1] [1, 1, 1] : memref<1000x1x1xf32> to memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>> + vector.transfer_write %vec, %0[%ii, %c0, %c0] {in_bounds = [true, true, true]} : vector<[4]x1x1xf32>, memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>> + return +} +// CHECK-LABEL: func.func @contiguous_inner_most_dim_with_subview_2d_scalable +// CHECK-SAME: %[[MEM:.*]]: memref<1000x1x1xf32>, +// CHECK-SAME: %[[IDX_1:.*]]: index, %[[IDX_2:.*]]: index, +// CHECK-SAME: %[[VEC:.*]]: vector<[4]x1x1xf32>) { +// CHECK: %[[SV_1:.*]] = memref.subview %[[MEM]]{{\[}}%[[IDX_1]], 0, 0] [40, 1, 1] [1, 1, 1] : memref<1000x1x1xf32> to memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>> +// CHECK: %[[SV_2:.*]] = memref.subview %[[SV_1]][0, 0, 0] [40, 1, 1] [1, 1, 1] : memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>> to memref<40xf32, strided<[1], offset: ?>> +// CHECK: %[[SC:.*]] = vector.shape_cast %[[VEC]] : vector<[4]x1x1xf32> to vector<[4]xf32> +// CHECK: vector.transfer_write %[[SC]], %[[SV_2]]{{\[}}%[[IDX_2]]] {in_bounds = [true]} : vector<[4]xf32>, memref<40xf32, strided<[1], offset: ?>> + +// ----- + +// NOTE: This is an out-of-bounds access. + +func.func @negative_non_unit_inner_vec_dim(%dest: memref<4x1xf32>, %vec: vector<4x8xf32>) { + %c0 = arith.constant 0 : index + vector.transfer_write %vec, %dest[%c0, %c0] : vector<4x8xf32>, memref<4x1xf32> + return +} +// CHECK: func.func @negative_non_unit_inner_vec_dim +// CHECK-NOT: memref.subview +// CHECK: vector.transfer_write // ----- -func.func @non_unit_strides(%arg0: memref<512x16x1xf32, strided<[8192, 16, 4], offset: ?>>, %arg1: vector<16x16x1xf32>, %arg2: index) { +func.func @negative_non_unit_inner_memref_dim(%dest: memref<4x8xf32>, %vec: vector<4x1xf32>) { %c0 = arith.constant 0 : index - vector.transfer_write %arg1, %arg0[%arg2, %c0, %c0] + vector.transfer_write %vec, %dest[%c0, %c0] : vector<4x1xf32>, memref<4x8xf32> + return +} +// CHECK: func.func @negative_non_unit_inner_memref_dim +// CHECK-NOT: memref.subview +// CHECK: vector.transfer_write + +// ----- + +// The inner most unit dims can not be dropped if the strides are not ones. + +func.func @negative_non_unit_strides(%dest: memref<512x16x1xf32, strided<[8192, 16, 4], offset: ?>>, %v: vector<16x16x1xf32>, %i: index) { + %c0 = arith.constant 0 : index + vector.transfer_write %v, %dest[%i, %c0, %c0] {in_bounds = [true, true, true]} : vector<16x16x1xf32>, memref<512x16x1xf32, strided<[8192, 16, 4], offset: ?>> return } -// The inner most unit dims can not be dropped if the strides are not ones. -// CHECK: func.func @non_unit_strides +// CHECK: func.func @negative_non_unit_strides // CHECK-NOT: memref.subview diff --git a/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir b/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir index 35418b38df9b25..b9dcb2b55e52ef 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir @@ -228,7 +228,7 @@ func.func @permutation_with_mask_xfer_read_scalable(%mem: memref, %dim_ func.func @masked_permutation_xfer_read_fixed_width(%arg0: tensor, %mask : vector<4x1xi1>) { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %3 = vector.mask %mask { vector.transfer_read %arg0[%c0, %c0], %cst {permutation_map = affine_map<(d0, d1) -> (d1, 0, d0)>} : tensor, vector<1x4x4xf32> } : vector<4x1xi1> -> vector<1x4x4xf32> + %3 = vector.mask %mask { vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [false, true, false], permutation_map = affine_map<(d0, d1) -> (d1, 0, d0)>} : tensor, vector<1x4x4xf32> } : vector<4x1xi1> -> vector<1x4x4xf32> call @test.some_use(%3) : (vector<1x4x4xf32>) -> () return } diff --git a/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir b/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir index d169e6d5878e25..2b7906d4dd40cb 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir @@ -28,7 +28,7 @@ func.func @vector_transfer_ops_0d_tensor(%M: tensor) -> vector<1xf32> { // CHECK-NEXT: %[[S:.*]] = tensor.extract %[[SOURCE]][] : tensor // CHECK-NEXT: %[[V:.*]] = vector.broadcast %[[S]] : f32 to vector<1xf32> - %0 = vector.transfer_read %M[], %f0 {permutation_map = affine_map<()->(0)>} : + %0 = vector.transfer_read %M[], %f0 {in_bounds = [true], permutation_map = affine_map<()->(0)>} : tensor, vector<1xf32> // CHECK-NEXT: return %[[V]] @@ -296,8 +296,8 @@ func.func @transfer_read_permutations(%arg0 : memref, %arg1 : memref %mask1 = vector.splat %m : vector<16x14xi1> - %1 = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %cst, %mask1 {permutation_map = #map1} : memref, vector<7x14x8x16xf32> -// CHECK: vector.transfer_read {{.*}} %[[MASK1]] {permutation_map = #[[$MAP0]]} : memref, vector<16x14x7x8xf32> + %1 = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %cst, %mask1 {in_bounds = [true, false, true, false], permutation_map = #map1} : memref, vector<7x14x8x16xf32> +// CHECK: vector.transfer_read {{.*}} %[[MASK1]] {in_bounds = [false, false, true, true], permutation_map = #[[$MAP0]]} : memref, vector<16x14x7x8xf32> // CHECK: vector.transpose %{{.*}}, [2, 1, 3, 0] : vector<16x14x7x8xf32> to vector<7x14x8x16xf32> // CHECK: %[[MASK3:.*]] = vector.splat %{{.*}} : vector<14x7xi1> @@ -307,12 +307,12 @@ func.func @transfer_read_permutations(%arg0 : memref, %arg1 : memref to vector<8x14x16x7xf32> // CHECK: vector.transpose %{{.*}}, [3, 1, 0, 2] : vector<8x14x16x7xf32> to vector<7x14x8x16xf32> - %3 = vector.transfer_read %arg0[%c0, %c0], %cst {permutation_map = #map3} : memref, vector<7x14x8x16xf32> + %3 = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [false, false, true, true], permutation_map = #map3} : memref, vector<7x14x8x16xf32> // CHECK: vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CF0]] : memref, vector<14x7xf32> // CHECK: vector.broadcast %{{.*}} : vector<14x7xf32> to vector<8x16x14x7xf32> // CHECK: vector.transpose %{{.*}}, [3, 2, 0, 1] : vector<8x16x14x7xf32> to vector<7x14x8x16xf32> - %4 = vector.transfer_read %arg0[%c0, %c0], %cst {permutation_map = #map4} : memref, vector<7x14x8x16xf32> + %4 = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, false, true, false], permutation_map = #map4} : memref, vector<7x14x8x16xf32> // CHECK: vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CF0]] : memref, vector<16x14xf32> // CHECK: vector.broadcast %{{.*}} : vector<16x14xf32> to vector<7x8x16x14xf32> // CHECK: vector.transpose %{{.*}}, [0, 3, 1, 2] : vector<7x8x16x14xf32> to vector<7x14x8x16xf32> @@ -321,7 +321,7 @@ func.func @transfer_read_permutations(%arg0 : memref, %arg1 : memref, vector<16x14x7x8xf32> // CHECK: vector.transpose %{{.*}}, [2, 1, 3, 0] : vector<16x14x7x8xf32> to vector<7x14x8x16xf32> - %6 = vector.transfer_read %arg0[%c0, %c0], %cst {permutation_map = #map6} : memref, vector<8xf32> + %6 = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true], permutation_map = #map6} : memref, vector<8xf32> // CHECK: memref.load %{{.*}}[%[[C0]], %[[C0]]] : memref // CHECK: vector.broadcast %{{.*}} : f32 to vector<8xf32> diff --git a/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir b/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir index 578d845a27ad43..eb0db736d5da58 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir @@ -199,7 +199,7 @@ func.func @transfer_read_unroll_permutation(%arg0 : memref<6x4xf32>) -> vector<4 func.func @transfer_read_unroll_broadcast(%arg0 : memref<6x4xf32>) -> vector<6x4xf32> { %c0 = arith.constant 0 : index %cf0 = arith.constant 0.0 : f32 - %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 {permutation_map = #map0} : memref<6x4xf32>, vector<6x4xf32> + %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 {in_bounds = [true, false], permutation_map = #map0} : memref<6x4xf32>, vector<6x4xf32> return %0 : vector<6x4xf32> } @@ -226,7 +226,7 @@ func.func @transfer_read_unroll_broadcast(%arg0 : memref<6x4xf32>) -> vector<6x4 func.func @transfer_read_unroll_broadcast_permuation(%arg0 : memref<6x4xf32>) -> vector<4x6xf32> { %c0 = arith.constant 0 : index %cf0 = arith.constant 0.0 : f32 - %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 {permutation_map = #map0} : memref<6x4xf32>, vector<4x6xf32> + %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 {in_bounds = [true, false], permutation_map = #map0} : memref<6x4xf32>, vector<4x6xf32> return %0 : vector<4x6xf32> } diff --git a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-1d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-1d.mlir index 8a98d39e657f2c..12b0511d486ea0 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-1d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-1d.mlir @@ -82,7 +82,7 @@ func.func @transfer_read_1d_broadcast( %A : memref, %base1 : index, %base2 : index) { %fm42 = arith.constant -42.0: f32 %f = vector.transfer_read %A[%base1, %base2], %fm42 - {permutation_map = affine_map<(d0, d1) -> (0)>} + {in_bounds = [true], permutation_map = affine_map<(d0, d1) -> (0)>} : memref, vector<9xf32> vector.print %f: vector<9xf32> return diff --git a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-2d.mlir index cb8a8ce8ab0b0e..9f8849fa9a1489 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-2d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-2d.mlir @@ -57,7 +57,7 @@ func.func @transfer_read_2d_mask_broadcast( %fm42 = arith.constant -42.0: f32 %mask = arith.constant dense<[1, 0, 1, 0, 1, 1, 1, 0, 1]> : vector<9xi1> %f = vector.transfer_read %A[%base1, %base2], %fm42, %mask - {permutation_map = affine_map<(d0, d1) -> (0, d1)>} : + {in_bounds = [true, false], permutation_map = affine_map<(d0, d1) -> (0, d1)>} : memref, vector<4x9xf32> vector.print %f: vector<4x9xf32> return @@ -69,7 +69,7 @@ func.func @transfer_read_2d_mask_transpose_broadcast_last_dim( %fm42 = arith.constant -42.0: f32 %mask = arith.constant dense<[1, 0, 1, 1]> : vector<4xi1> %f = vector.transfer_read %A[%base1, %base2], %fm42, %mask - {permutation_map = affine_map<(d0, d1) -> (d1, 0)>} : + {in_bounds = [false, true], permutation_map = affine_map<(d0, d1) -> (d1, 0)>} : memref, vector<4x9xf32> vector.print %f: vector<4x9xf32> return @@ -91,7 +91,7 @@ func.func @transfer_read_2d_broadcast( %A : memref, %base1: index, %base2: index) { %fm42 = arith.constant -42.0: f32 %f = vector.transfer_read %A[%base1, %base2], %fm42 - {permutation_map = affine_map<(d0, d1) -> (d1, 0)>} : + {in_bounds = [false, true], permutation_map = affine_map<(d0, d1) -> (d1, 0)>} : memref, vector<4x9xf32> vector.print %f: vector<4x9xf32> return diff --git a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-3d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-3d.mlir index 4aecca3d6891eb..466afeec459b43 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-3d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/transfer-read-3d.mlir @@ -32,7 +32,7 @@ func.func @transfer_read_3d_broadcast(%A : memref, %o: index, %a: index, %b: index, %c: index) { %fm42 = arith.constant -42.0: f32 %f = vector.transfer_read %A[%o, %a, %b, %c], %fm42 - {permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, 0, d3)>} + {in_bounds = [false, true, false], permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, 0, d3)>} : memref, vector<2x5x3xf32> vector.print %f: vector<2x5x3xf32> return @@ -43,7 +43,7 @@ func.func @transfer_read_3d_mask_broadcast( %fm42 = arith.constant -42.0: f32 %mask = arith.constant dense<[0, 1]> : vector<2xi1> %f = vector.transfer_read %A[%o, %a, %b, %c], %fm42, %mask - {permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, 0, 0)>} + {in_bounds = [false, true, true], permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, 0, 0)>} : memref, vector<2x5x3xf32> vector.print %f: vector<2x5x3xf32> return diff --git a/mlir/test/Integration/GPU/CUDA/async.mlir b/mlir/test/Integration/GPU/CUDA/async.mlir index d96a63f26264e3..2ec781571818d6 100644 --- a/mlir/test/Integration/GPU/CUDA/async.mlir +++ b/mlir/test/Integration/GPU/CUDA/async.mlir @@ -1,7 +1,7 @@ // RUN: mlir-opt %s \ // RUN: | mlir-opt -gpu-kernel-outlining \ // RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)' \ -// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary="format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary="format=%gpu_compilation_format" \ // RUN: | mlir-opt -async-to-async-runtime -async-runtime-ref-counting \ // RUN: | mlir-opt -convert-async-to-llvm -convert-func-to-llvm \ // RUN: | mlir-cpu-runner \ diff --git a/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir b/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir index 405471b9af0043..3c8f3b1d0cbf4b 100644 --- a/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir +++ b/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir @@ -1,7 +1,7 @@ // RUN: mlir-opt %s \ // RUN: | mlir-opt -gpu-kernel-outlining \ // RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl),rocdl-attach-target{chip=%chip})' \ -// RUN: | mlir-opt -gpu-to-llvm -gpu-module-to-binary \ +// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_rocm_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/ROCM/printf.mlir b/mlir/test/Integration/GPU/ROCM/printf.mlir index ae4309c848e714..d5e6e3757540b2 100644 --- a/mlir/test/Integration/GPU/ROCM/printf.mlir +++ b/mlir/test/Integration/GPU/ROCM/printf.mlir @@ -1,6 +1,6 @@ // RUN: mlir-opt %s \ // RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl{index-bitwidth=32 runtime=HIP}),rocdl-attach-target{chip=%chip})' \ -// RUN: | mlir-opt -gpu-to-llvm -gpu-module-to-binary \ +// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_rocm_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/ROCM/two-modules.mlir b/mlir/test/Integration/GPU/ROCM/two-modules.mlir index 796ada5379ead0..d49d3957abbe96 100644 --- a/mlir/test/Integration/GPU/ROCM/two-modules.mlir +++ b/mlir/test/Integration/GPU/ROCM/two-modules.mlir @@ -1,7 +1,7 @@ // RUN: mlir-opt %s \ // RUN: | mlir-opt -gpu-kernel-outlining \ // RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl),rocdl-attach-target{chip=%chip})' \ -// RUN: | mlir-opt -gpu-to-llvm -gpu-module-to-binary \ +// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_rocm_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/ROCM/vecadd.mlir b/mlir/test/Integration/GPU/ROCM/vecadd.mlir index ccbaf25530acee..986d8239427e3c 100644 --- a/mlir/test/Integration/GPU/ROCM/vecadd.mlir +++ b/mlir/test/Integration/GPU/ROCM/vecadd.mlir @@ -2,7 +2,7 @@ // RUN: | mlir-opt -convert-scf-to-cf \ // RUN: | mlir-opt -gpu-kernel-outlining \ // RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl{use-bare-ptr-memref-call-conv=true}),rocdl-attach-target{chip=%chip})' \ -// RUN: | mlir-opt -gpu-to-llvm=use-bare-pointers-for-kernels=true -gpu-module-to-binary \ +// RUN: | mlir-opt -gpu-to-llvm=use-bare-pointers-for-kernels=true -reconcile-unrealized-casts -gpu-module-to-binary \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_rocm_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir b/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir index b147a8f4d71d0d..575d967dcc9a23 100644 --- a/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir +++ b/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir @@ -2,7 +2,7 @@ // RUN: | mlir-opt -convert-scf-to-cf \ // RUN: | mlir-opt -gpu-kernel-outlining \ // RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl{chipset=%chip index-bitwidth=32}),rocdl-attach-target{chip=%chip})' \ -// RUN: | mlir-opt -gpu-to-llvm -gpu-module-to-binary \ +// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_rocm_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Target/LLVMIR/Import/tune-cpu.ll b/mlir/test/Target/LLVMIR/Import/tune-cpu.ll new file mode 100644 index 00000000000000..991a70ada473c5 --- /dev/null +++ b/mlir/test/Target/LLVMIR/Import/tune-cpu.ll @@ -0,0 +1,16 @@ +; RUN: mlir-translate -import-llvm -split-input-file %s | FileCheck %s + +; CHECK-LABEL: llvm.func @tune_cpu_x86() +; CHECK-SAME: tune_cpu = "pentium4" +define void @tune_cpu_x86() #0 { + ret void +} + +; CHECK-LABEL: llvm.func @tune_cpu_arm() +; CHECK-SAME: tune_cpu = "neoverse-n1" +define void @tune_cpu_arm() #1 { + ret void +} + +attributes #0 = { "tune-cpu"="pentium4" } +attributes #1 = { "tune-cpu"="neoverse-n1" } diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir index 1e533aeacfb49d..7878aa5ee46d4f 100644 --- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir @@ -1069,8 +1069,8 @@ llvm.func @experimental_constrained_fptrunc(%s: f64, %v: vector<4xf32>) { // CHECK-DAG: declare void @llvm.debugtrap() // CHECK-DAG: declare void @llvm.ubsantrap(i8 immarg) // CHECK-DAG: declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg) -// CHECK-DAG: declare void @llvm.memcpy.inline.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32 immarg, i1 immarg) -// CHECK-DAG: declare void @llvm.memcpy.inline.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64 immarg, i1 immarg) +// CHECK-DAG: declare void @llvm.memcpy.inline.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg) +// CHECK-DAG: declare void @llvm.memcpy.inline.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) // CHECK-DAG: declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) // CHECK-DAG: declare { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32>, <8 x i32>) // CHECK-DAG: declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) diff --git a/mlir/test/Target/LLVMIR/tune-cpu.mlir b/mlir/test/Target/LLVMIR/tune-cpu.mlir new file mode 100644 index 00000000000000..c7969f5eb4db03 --- /dev/null +++ b/mlir/test/Target/LLVMIR/tune-cpu.mlir @@ -0,0 +1,14 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// CHECK: define void @tune_cpu_x86() #[[ATTRSX86:.*]] { +// CHECK: define void @tune_cpu_arm() #[[ATTRSARM:.*]] { +// CHECK: attributes #[[ATTRSX86]] = { "tune-cpu"="pentium4" } +// CHECK: attributes #[[ATTRSARM]] = { "tune-cpu"="neoverse-n1" } + +llvm.func @tune_cpu_x86() attributes {tune_cpu = "pentium4"} { + llvm.return +} + +llvm.func @tune_cpu_arm() attributes {tune_cpu = "neoverse-n1"} { + llvm.return +} diff --git a/mlir/test/Transforms/test-block-legalization.mlir b/mlir/test/Transforms/test-block-legalization.mlir new file mode 100644 index 00000000000000..d739f95a569472 --- /dev/null +++ b/mlir/test/Transforms/test-block-legalization.mlir @@ -0,0 +1,44 @@ +// RUN: mlir-opt %s -transform-interpreter | FileCheck %s + +// CHECK-LABEL: func @complex_block_signature_conversion( +// CHECK: %[[cst:.*]] = complex.constant +// CHECK: %[[complex_llvm:.*]] = builtin.unrealized_conversion_cast %[[cst]] : complex to !llvm.struct<(f64, f64)> +// Note: Some blocks are omitted. +// CHECK: llvm.br ^[[block1:.*]](%[[complex_llvm]] +// CHECK: ^[[block1]](%[[arg:.*]]: !llvm.struct<(f64, f64)>): +// CHECK: %[[cast:.*]] = builtin.unrealized_conversion_cast %[[arg]] : !llvm.struct<(f64, f64)> to complex +// CHECK: llvm.br ^[[block2:.*]] +// CHECK: ^[[block2]]: +// CHECK: "test.consumer_of_complex"(%[[cast]]) : (complex) -> () +func.func @complex_block_signature_conversion() { + %cst = complex.constant [0.000000e+00, 0.000000e+00] : complex + %true = arith.constant true + %0 = scf.if %true -> complex { + scf.yield %cst : complex + } else { + scf.yield %cst : complex + } + + // Regression test to ensure that the a source materialization is inserted. + // The operand of "test.consumer_of_complex" must not change. + "test.consumer_of_complex"(%0) : (complex) -> () + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%toplevel_module: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %toplevel_module + : (!transform.any_op) -> !transform.any_op + transform.apply_conversion_patterns to %func { + transform.apply_conversion_patterns.dialect_to_llvm "cf" + transform.apply_conversion_patterns.func.func_to_llvm + transform.apply_conversion_patterns.scf.scf_to_control_flow + } with type_converter { + transform.apply_conversion_patterns.memref.memref_to_llvm_type_converter + } { + legal_dialects = ["llvm"], + partial_conversion + } : !transform.any_op + transform.yield + } +} diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index 1175f87877f9e8..98d0ddd9a2be11 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -106,6 +106,7 @@ def add_runtime(name): "mlir-capi-pass-test", "mlir-capi-pdl-test", "mlir-capi-quant-test", + "mlir-capi-rewrite-test", "mlir-capi-sparse-tensor-test", "mlir-capi-transform-test", "mlir-capi-transform-interpreter-test", diff --git a/mlir/test/mlir-tblgen/gen-dialect-doc.td b/mlir/test/mlir-tblgen/gen-dialect-doc.td index c9492eb9ac3cef..79d755111e8f67 100644 --- a/mlir/test/mlir-tblgen/gen-dialect-doc.td +++ b/mlir/test/mlir-tblgen/gen-dialect-doc.td @@ -3,6 +3,7 @@ include "mlir/IR/OpBase.td" include "mlir/IR/AttrTypeBase.td" +include "mlir/IR/EnumAttr.td" include "mlir/Interfaces/SideEffectInterfaces.td" def Test_Dialect : Dialect { @@ -69,6 +70,16 @@ def TestTypeDefParams : TypeDef { let assemblyFormat = "`<` $value `>`"; } +def TestEnum : + I32EnumAttr<"TestEnum", + "enum summary", [ + I32EnumAttrCase<"First", 0, "first">, + I32EnumAttrCase<"Second", 1, "second">, + I32EnumAttrCase<"Third", 2, "third">]> { + let genSpecializedAttr = 1; + let cppNamespace = "NS"; +} + // CHECK: Dialect without a [TOC] here. // CHECK: TOC added by tool. // CHECK: [TOC] @@ -109,6 +120,16 @@ def TestTypeDefParams : TypeDef { // CHECK: Syntax: // CHECK: !test.test_type_def_params +// CHECK: ## Enums +// CHECK: ### TestEnum +// CHECK: enum summary +// CHECK: #### Cases: +// CHECK: | Symbol | Value | String | +// CHECK: | :----: | :---: | ------ | +// CHECK: | First | `0` | first | +// CHECK: | Second | `1` | second | +// CHECK: | Third | `2` | third | + def Toc_Dialect : Dialect { let name = "test_toc"; let summary = "Dialect of ops to test"; diff --git a/mlir/test/python/dialects/vector.py b/mlir/test/python/dialects/vector.py index dafb2bfde8982d..77eaf94a830d96 100644 --- a/mlir/test/python/dialects/vector.py +++ b/mlir/test/python/dialects/vector.py @@ -51,10 +51,16 @@ def testTransferReadOp(): with InsertionPoint(f.add_entry_block()): A, zero, padding, mask = f.arguments vector.TransferReadOp( - vector_type, A, [zero, zero], identity_map_attr, padding, mask=mask + vector_type, + A, + [zero, zero], + identity_map_attr, + padding, + [False, False], + mask=mask, ) vector.TransferReadOp( - vector_type, A, [zero, zero], identity_map_attr, padding + vector_type, A, [zero, zero], identity_map_attr, padding, [False, False] ) func.ReturnOp([]) diff --git a/mlir/tools/mlir-tblgen/OpDocGen.cpp b/mlir/tools/mlir-tblgen/OpDocGen.cpp index 7cd2690ea81557..71df80cd110f15 100644 --- a/mlir/tools/mlir-tblgen/OpDocGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDocGen.cpp @@ -16,6 +16,7 @@ #include "OpGenHelpers.h" #include "mlir/Support/IndentedOstream.h" #include "mlir/TableGen/AttrOrTypeDef.h" +#include "mlir/TableGen/Attribute.h" #include "mlir/TableGen/GenInfo.h" #include "mlir/TableGen/Operator.h" #include "llvm/ADT/DenseMap.h" @@ -37,7 +38,7 @@ // Commandline Options //===----------------------------------------------------------------------===// static llvm::cl::OptionCategory - docCat("Options for -gen-(attrdef|typedef|op|dialect)-doc"); + docCat("Options for -gen-(attrdef|typedef|enum|op|dialect)-doc"); llvm::cl::opt stripPrefix("strip-prefix", llvm::cl::desc("Strip prefix of the fully qualified names"), @@ -228,8 +229,7 @@ static void emitOpDoc(const Operator &op, raw_ostream &os) { // Expandable description. // This appears as just the summary, but when clicked shows the full // description. - os << "

    " - << "" << it.attr.getSummary() << "" + os << "
    " << "" << it.attr.getSummary() << "" << "{{% markdown %}}" << description << "{{% /markdown %}}" << "
    "; } else { @@ -381,6 +381,39 @@ static void emitAttrOrTypeDefDoc(const RecordKeeper &recordKeeper, emitAttrOrTypeDefDoc(AttrOrTypeDef(def), os); } +//===----------------------------------------------------------------------===// +// Enum Documentation +//===----------------------------------------------------------------------===// + +static void emitEnumDoc(const EnumAttr &def, raw_ostream &os) { + os << llvm::formatv("### {0}\n", def.getEnumClassName()); + + // Emit the summary if present. + if (!def.getSummary().empty()) + os << "\n" << def.getSummary() << "\n"; + + // Emit case documentation. + std::vector cases = def.getAllCases(); + os << "\n#### Cases:\n\n"; + os << "| Symbol | Value | String |\n" + << "| :----: | :---: | ------ |\n"; + for (const auto &it : cases) { + os << "| " << it.getSymbol() << " | `" << it.getValue() << "` | " + << it.getStr() << " |\n"; + } + + os << "\n"; +} + +static void emitEnumDoc(const RecordKeeper &recordKeeper, raw_ostream &os) { + std::vector defs = + recordKeeper.getAllDerivedDefinitions("EnumAttr"); + + os << "\n"; + for (const llvm::Record *def : defs) + emitEnumDoc(EnumAttr(def), os); +} + //===----------------------------------------------------------------------===// // Dialect Documentation //===----------------------------------------------------------------------===// @@ -413,7 +446,7 @@ static void maybeNest(bool nest, llvm::function_ref fn, static void emitBlock(ArrayRef attributes, StringRef inputFilename, ArrayRef attrDefs, ArrayRef ops, ArrayRef types, ArrayRef typeDefs, - raw_ostream &os) { + ArrayRef enums, raw_ostream &os) { if (!ops.empty()) { os << "## Operations\n\n"; emitSourceLink(inputFilename, os); @@ -459,13 +492,19 @@ static void emitBlock(ArrayRef attributes, StringRef inputFilename, for (const TypeDef &def : typeDefs) emitAttrOrTypeDefDoc(def, os); } + + if (!enums.empty()) { + os << "## Enums\n\n"; + for (const EnumAttr &def : enums) + emitEnumDoc(def, os); + } } static void emitDialectDoc(const Dialect &dialect, StringRef inputFilename, ArrayRef attributes, ArrayRef attrDefs, ArrayRef ops, ArrayRef types, ArrayRef typeDefs, - raw_ostream &os) { + ArrayRef enums, raw_ostream &os) { os << "# '" << dialect.getName() << "' Dialect\n\n"; emitIfNotEmpty(dialect.getSummary(), os); emitIfNotEmpty(dialect.getDescription(), os); @@ -475,7 +514,8 @@ static void emitDialectDoc(const Dialect &dialect, StringRef inputFilename, if (!r.match(dialect.getDescription())) os << "[TOC]\n\n"; - emitBlock(attributes, inputFilename, attrDefs, ops, types, typeDefs, os); + emitBlock(attributes, inputFilename, attrDefs, ops, types, typeDefs, enums, + os); } static bool emitDialectDoc(const RecordKeeper &recordKeeper, raw_ostream &os) { @@ -495,21 +535,27 @@ static bool emitDialectDoc(const RecordKeeper &recordKeeper, raw_ostream &os) { recordKeeper.getAllDerivedDefinitionsIfDefined("TypeDef"); std::vector attrDefDefs = recordKeeper.getAllDerivedDefinitionsIfDefined("AttrDef"); + std::vector enumDefs = + recordKeeper.getAllDerivedDefinitionsIfDefined("EnumAttrInfo"); std::vector dialectAttrs; std::vector dialectAttrDefs; std::vector dialectOps; std::vector dialectTypes; std::vector dialectTypeDefs; + std::vector dialectEnums; llvm::SmallDenseSet seen; - auto addIfInDialect = [&](llvm::Record *record, const auto &def, auto &vec) { - if (seen.insert(record).second && def.getDialect() == *dialect) { + auto addIfNotSeen = [&](llvm::Record *record, const auto &def, auto &vec) { + if (seen.insert(record).second) { vec.push_back(def); return true; } return false; }; + auto addIfInDialect = [&](llvm::Record *record, const auto &def, auto &vec) { + return def.getDialect() == *dialect && addIfNotSeen(record, def, vec); + }; SmallDenseMap opDocGroup; @@ -539,6 +585,9 @@ static bool emitDialectDoc(const RecordKeeper &recordKeeper, raw_ostream &os) { addIfInDialect(def, TypeDef(def), dialectTypeDefs); for (Record *def : typeDefs) addIfInDialect(def, Type(def), dialectTypes); + dialectEnums.reserve(enumDefs.size()); + for (Record *def : enumDefs) + addIfNotSeen(def, EnumAttr(def), dialectEnums); // Sort alphabetically ignorning dialect for ops and section name for // sections. @@ -557,7 +606,7 @@ static bool emitDialectDoc(const RecordKeeper &recordKeeper, raw_ostream &os) { os << "\n"; emitDialectDoc(*dialect, recordKeeper.getInputFilename(), dialectAttrs, dialectAttrDefs, dialectOps, dialectTypes, dialectTypeDefs, - os); + dialectEnums, os); return false; } @@ -587,6 +636,13 @@ static mlir::GenRegistration return false; }); +static mlir::GenRegistration + genEnumRegister("gen-enum-doc", "Generate dialect enum documentation", + [](const RecordKeeper &records, raw_ostream &os) { + emitEnumDoc(records, os); + return false; + }); + static mlir::GenRegistration genRegister("gen-dialect-doc", "Generate dialect documentation", [](const RecordKeeper &records, raw_ostream &os) { diff --git a/mlir/unittests/IR/AffineExprTest.cpp b/mlir/unittests/IR/AffineExprTest.cpp index a0affc4341b0b4..75c893334943d3 100644 --- a/mlir/unittests/IR/AffineExprTest.cpp +++ b/mlir/unittests/IR/AffineExprTest.cpp @@ -98,3 +98,11 @@ TEST(AffineExprTest, divisionSimplification) { ASSERT_EQ((d0 * 6).ceilDiv(4).getKind(), AffineExprKind::CeilDiv); ASSERT_EQ((d0 * 6).ceilDiv(-2), d0 * -3); } + +TEST(AffineExprTest, modSimplificationRegression) { + MLIRContext ctx; + OpBuilder b(&ctx); + auto d0 = b.getAffineDimExpr(0); + auto sum = d0 + d0.floorDiv(3).floorDiv(-3); + ASSERT_EQ(sum.getKind(), AffineExprKind::Add); +} diff --git a/offload/test/api/omp_dynamic_shared_memory_amdgpu.c b/offload/test/api/omp_dynamic_shared_memory_amdgpu.c index 0b4d9d6ea9d46e..dd4d59257714be 100644 --- a/offload/test/api/omp_dynamic_shared_memory_amdgpu.c +++ b/offload/test/api/omp_dynamic_shared_memory_amdgpu.c @@ -2,6 +2,7 @@ // RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \ // RUN: %libomptarget-run-amdgcn-amd-amdhsa | %fcheck-amdgcn-amd-amdhsa // REQUIRES: amdgcn-amd-amdhsa +// XFAIL: amdgcn-amd-amdhsa #include #include diff --git a/offload/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c b/offload/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c index 656c3a20aaf82a..844c25dc9e0258 100644 --- a/offload/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c +++ b/offload/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c @@ -2,6 +2,7 @@ // RUN: env LIBOMPTARGET_NEXTGEN_PLUGINS=1 \ // RUN: %libomptarget-run-amdgcn-amd-amdhsa | %fcheck-amdgcn-amd-amdhsa // REQUIRES: amdgcn-amd-amdhsa +// XFAIL: amdgcn-amd-amdhsa #include "omp_dynamic_shared_memory_mixed.inc" // CHECK: PASS diff --git a/offload/test/offloading/bug51781.c b/offload/test/offloading/bug51781.c index 35ecf55aa8c534..237e1585455c52 100644 --- a/offload/test/offloading/bug51781.c +++ b/offload/test/offloading/bug51781.c @@ -31,6 +31,7 @@ // RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic // // CUSTOM: Rewriting generic-mode kernel with a customized state machine. +// XFAIL: amdgcn-amd-amdhsa #if ADD_REDUCTION #define REDUCTION(...) reduction(__VA_ARGS__) diff --git a/offload/test/offloading/bug51982.c b/offload/test/offloading/bug51982.c index 91ce4a264e2382..c037d215a8a793 100644 --- a/offload/test/offloading/bug51982.c +++ b/offload/test/offloading/bug51982.c @@ -1,6 +1,7 @@ // RUN: %libomptarget-compile-generic -O1 && %libomptarget-run-generic // -O1 to run openmp-opt // RUN: %libomptarget-compileopt-generic -O1 && %libomptarget-run-generic +// XFAIL: amdgcn-amd-amdhsa int main(void) { long int aa = 0; diff --git a/openmp/libompd/gdb-plugin/ompd/ompd.py b/openmp/libompd/gdb-plugin/ompd/ompd.py index a404e621e77bba..8355865408a4ea 100644 --- a/openmp/libompd/gdb-plugin/ompd/ompd.py +++ b/openmp/libompd/gdb-plugin/ompd/ompd.py @@ -50,7 +50,7 @@ def invoke(self, arg, from_tty): "No ompd_dll_locations symbol in execution, make sure to have an OMPD enabled OpenMP runtime" ) - while gdb.parse_and_eval("(char**)ompd_dll_locations") == False: + while not gdb.parse_and_eval("(char**)ompd_dll_locations"): gdb.execute("tbreak ompd_dll_locations_valid") gdb.execute("continue") diff --git a/openmp/libompd/gdb-plugin/ompd/ompd_callbacks.py b/openmp/libompd/gdb-plugin/ompd/ompd_callbacks.py index ada09d75579f0c..40eb3305f2e244 100644 --- a/openmp/libompd/gdb-plugin/ompd/ompd_callbacks.py +++ b/openmp/libompd/gdb-plugin/ompd/ompd_callbacks.py @@ -84,7 +84,7 @@ def _thread_context(*args): m = re.search(r"(0x[a-fA-F0-9]+)", line) elif lwp: m = re.search(r"\([^)]*?(\d+)[^)]*?\)", line) - if m == None: + if m is None: continue pid = int(m.group(1), 0) if pid == thread_id: diff --git a/openmp/runtime/src/include/omp-tools.h.var b/openmp/runtime/src/include/omp-tools.h.var index 8ee179dfe84d7e..471f46a9073ee7 100644 --- a/openmp/runtime/src/include/omp-tools.h.var +++ b/openmp/runtime/src/include/omp-tools.h.var @@ -78,6 +78,8 @@ /* implicit barrier at the end of worksharing */ \ macro (ompt_state_wait_barrier_implicit, 0x013) /* implicit barrier */ \ macro (ompt_state_wait_barrier_explicit, 0x014) /* explicit barrier */ \ + macro (ompt_state_wait_barrier_implementation, 0x015) /* implementation barrier */ \ + macro (ompt_state_wait_barrier_teams, 0x016) /* teams barrier */ \ \ /* task wait states (32..63) */ \ macro (ompt_state_wait_taskwait, 0x020) /* waiting at a taskwait */ \ diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp index b381694c0953e2..658cee594e48d5 100644 --- a/openmp/runtime/src/kmp_barrier.cpp +++ b/openmp/runtime/src/kmp_barrier.cpp @@ -1805,7 +1805,25 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split, // It is OK to report the barrier state after the barrier begin callback. // According to the OMPT specification, a compliant implementation may // even delay reporting this state until the barrier begins to wait. - this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier; + auto *ompt_thr_info = &this_thr->th.ompt_thread_info; + switch (barrier_kind) { + case ompt_sync_region_barrier_explicit: + ompt_thr_info->state = ompt_state_wait_barrier_explicit; + break; + case ompt_sync_region_barrier_implicit_workshare: + ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare; + break; + case ompt_sync_region_barrier_implicit_parallel: + ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel; + break; + case ompt_sync_region_barrier_teams: + ompt_thr_info->state = ompt_state_wait_barrier_teams; + break; + case ompt_sync_region_barrier_implementation: + [[fallthrough]]; + default: + ompt_thr_info->state = ompt_state_wait_barrier_implementation; + } } #endif @@ -2213,20 +2231,24 @@ void __kmp_join_barrier(int gtid) { codeptr = team->t.ompt_team_info.master_return_address; my_task_data = OMPT_CUR_TASK_DATA(this_thr); my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); + ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel; + ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel; + if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) { + sync_kind = ompt_sync_region_barrier_teams; + ompt_state = ompt_state_wait_barrier_teams; + } if (ompt_enabled.ompt_callback_sync_region) { ompt_callbacks.ompt_callback(ompt_callback_sync_region)( - ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data, - my_task_data, codeptr); + sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr); } if (ompt_enabled.ompt_callback_sync_region_wait) { ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( - ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data, - my_task_data, codeptr); + sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr); } if (!KMP_MASTER_TID(ds_tid)) this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr); #endif - this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit; + this_thr->th.ompt_thread_info.state = ompt_state; } #endif @@ -2488,8 +2510,10 @@ void __kmp_fork_barrier(int gtid, int tid) { } #if OMPT_SUPPORT + ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state; if (ompt_enabled.enabled && - this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { + (ompt_state == ompt_state_wait_barrier_teams || + ompt_state == ompt_state_wait_barrier_implicit_parallel)) { int ds_tid = this_thr->th.th_info.ds.ds_tid; ompt_data_t *task_data = (team) ? OMPT_CUR_TASK_DATA(this_thr) @@ -2501,15 +2525,16 @@ void __kmp_fork_barrier(int gtid, int tid) { (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || ompt_callbacks.ompt_callback(ompt_callback_sync_region))) codeptr = team ? team->t.ompt_team_info.master_return_address : NULL; + ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel; + if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) + sync_kind = ompt_sync_region_barrier_teams; if (ompt_enabled.ompt_callback_sync_region_wait) { ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( - ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, - codeptr); + sync_kind, ompt_scope_end, NULL, task_data, codeptr); } if (ompt_enabled.ompt_callback_sync_region) { ompt_callbacks.ompt_callback(ompt_callback_sync_region)( - ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, - codeptr); + sync_kind, ompt_scope_end, NULL, task_data, codeptr); } #endif if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp index b49c44f348d6b4..5b4391aa125d41 100644 --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp @@ -7745,7 +7745,7 @@ int __kmp_invoke_task_func(int gtid) { ); #if OMPT_SUPPORT *exit_frame_p = NULL; - this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; + this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_team; #endif #if KMP_STATS_ENABLED @@ -7843,7 +7843,7 @@ int __kmp_invoke_teams_master(int gtid) { #endif __kmp_teams_master(gtid); #if OMPT_SUPPORT - this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; + this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_league; #endif __kmp_run_after_invoked_task(gtid, 0, this_thr, team); return 1; @@ -8126,8 +8126,10 @@ void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { __kmp_join_barrier(gtid); /* wait for everyone */ #if OMPT_SUPPORT + ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state; if (ompt_enabled.enabled && - this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { + (ompt_state == ompt_state_wait_barrier_teams || + ompt_state == ompt_state_wait_barrier_implicit_parallel)) { int ds_tid = this_thr->th.th_info.ds.ds_tid; ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); this_thr->th.ompt_thread_info.state = ompt_state_overhead; @@ -8138,15 +8140,16 @@ void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { ompt_callbacks.ompt_callback(ompt_callback_sync_region))) codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; + ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel; + if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) + sync_kind = ompt_sync_region_barrier_teams; if (ompt_enabled.ompt_callback_sync_region_wait) { ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( - ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, - codeptr); + sync_kind, ompt_scope_end, NULL, task_data, codeptr); } if (ompt_enabled.ompt_callback_sync_region) { ompt_callbacks.ompt_callback(ompt_callback_sync_region)( - ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, - codeptr); + sync_kind, ompt_scope_end, NULL, task_data, codeptr); } #endif if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { diff --git a/openmp/runtime/src/kmp_wait_release.h b/openmp/runtime/src/kmp_wait_release.h index 12d5d0677a90a2..97db68943da702 100644 --- a/openmp/runtime/src/kmp_wait_release.h +++ b/openmp/runtime/src/kmp_wait_release.h @@ -323,19 +323,21 @@ static void __ompt_implicit_task_end(kmp_info_t *this_thr, ompt_state_t ompt_state, ompt_data_t *tId) { int ds_tid = this_thr->th.th_info.ds.ds_tid; - if (ompt_state == ompt_state_wait_barrier_implicit) { + if (ompt_state == ompt_state_wait_barrier_implicit_parallel || + ompt_state == ompt_state_wait_barrier_teams) { this_thr->th.ompt_thread_info.state = ompt_state_overhead; #if OMPT_OPTIONAL void *codeptr = NULL; + ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel; + if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) + sync_kind = ompt_sync_region_barrier_teams; if (ompt_enabled.ompt_callback_sync_region_wait) { ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( - ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId, - codeptr); + sync_kind, ompt_scope_end, NULL, tId, codeptr); } if (ompt_enabled.ompt_callback_sync_region) { ompt_callbacks.ompt_callback(ompt_callback_sync_region)( - ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId, - codeptr); + sync_kind, ompt_scope_end, NULL, tId, codeptr); } #endif if (!KMP_MASTER_TID(ds_tid)) { @@ -455,7 +457,9 @@ final_spin=FALSE) ompt_data_t *tId; if (ompt_enabled.enabled) { ompt_entry_state = this_thr->th.ompt_thread_info.state; - if (!final_spin || ompt_entry_state != ompt_state_wait_barrier_implicit || + if (!final_spin || + (ompt_entry_state != ompt_state_wait_barrier_implicit_parallel && + ompt_entry_state != ompt_state_wait_barrier_teams) || KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)) { ompt_lw_taskteam_t *team = NULL; if (this_thr->th.th_team) diff --git a/openmp/runtime/src/ompt-specific.cpp b/openmp/runtime/src/ompt-specific.cpp index 16acbe052d12e6..0737c0cdfb1602 100644 --- a/openmp/runtime/src/ompt-specific.cpp +++ b/openmp/runtime/src/ompt-specific.cpp @@ -503,22 +503,23 @@ static uint64_t __ompt_get_unique_id_internal() { ompt_sync_region_t __ompt_get_barrier_kind(enum barrier_type bt, kmp_info_t *thr) { - if (bt == bs_forkjoin_barrier) - return ompt_sync_region_barrier_implicit; + if (bt == bs_forkjoin_barrier) { + if (thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) + return ompt_sync_region_barrier_teams; + else + return ompt_sync_region_barrier_implicit_parallel; + } - if (bt != bs_plain_barrier) + if (bt != bs_plain_barrier || !thr->th.th_ident) return ompt_sync_region_barrier_implementation; - if (!thr->th.th_ident) - return ompt_sync_region_barrier; - kmp_int32 flags = thr->th.th_ident->flags; if ((flags & KMP_IDENT_BARRIER_EXPL) != 0) return ompt_sync_region_barrier_explicit; if ((flags & KMP_IDENT_BARRIER_IMPL) != 0) - return ompt_sync_region_barrier_implicit; + return ompt_sync_region_barrier_implicit_workshare; return ompt_sync_region_barrier_implementation; } diff --git a/openmp/runtime/test/ompt/callback.h b/openmp/runtime/test/ompt/callback.h index c1093141e9126c..07d38cf836dff0 100644 --- a/openmp/runtime/test/ompt/callback.h +++ b/openmp/runtime/test/ompt/callback.h @@ -93,6 +93,18 @@ static const char *ompt_dependence_type_t_values[36] = { "ompt_dependence_type_inout_all_memory" // 35 }; +static const char *ompt_sync_region_t_values[] = {"undefined", + "barrier", + "barrier_implicit", + "barrier_explicit", + "barrier_implementation", + "taskwait", + "taskgroup", + "reduction", + "barrier_implicit_workshare", + "barrier_implicit_parallel", + "barrier_teams"}; + static void format_task_type(int type, char *buffer) { char *progress = buffer; if (type & ompt_task_initial) @@ -506,89 +518,32 @@ on_ompt_callback_sync_region( ompt_data_t *task_data, const void *codeptr_ra) { - switch(endpoint) - { - case ompt_scope_begin: - switch(kind) - { - case ompt_sync_region_barrier: - case ompt_sync_region_barrier_implicit: - case ompt_sync_region_barrier_implicit_workshare: - case ompt_sync_region_barrier_implicit_parallel: - case ompt_sync_region_barrier_teams: - case ompt_sync_region_barrier_explicit: - case ompt_sync_region_barrier_implementation: - printf("%" PRIu64 ":" _TOOL_PREFIX - " ompt_event_barrier_begin: parallel_id=%" PRIu64 - ", task_id=%" PRIu64 ", codeptr_ra=%p\n", - ompt_get_thread_data()->value, parallel_data->value, - task_data->value, codeptr_ra); - print_ids(0); - break; - case ompt_sync_region_taskwait: - printf("%" PRIu64 ":" _TOOL_PREFIX - " ompt_event_taskwait_begin: parallel_id=%" PRIu64 - ", task_id=%" PRIu64 ", codeptr_ra=%p\n", - ompt_get_thread_data()->value, parallel_data->value, - task_data->value, codeptr_ra); - break; - case ompt_sync_region_taskgroup: - printf("%" PRIu64 ":" _TOOL_PREFIX - " ompt_event_taskgroup_begin: parallel_id=%" PRIu64 - ", task_id=%" PRIu64 ", codeptr_ra=%p\n", - ompt_get_thread_data()->value, parallel_data->value, - task_data->value, codeptr_ra); - break; - case ompt_sync_region_reduction: - printf("ompt_sync_region_reduction should never be passed to " - "on_ompt_callback_sync_region\n"); - exit(-1); - break; - } - break; - case ompt_scope_end: - switch(kind) - { - case ompt_sync_region_barrier: - case ompt_sync_region_barrier_implicit: - case ompt_sync_region_barrier_explicit: - case ompt_sync_region_barrier_implicit_workshare: - case ompt_sync_region_barrier_implicit_parallel: - case ompt_sync_region_barrier_teams: - case ompt_sync_region_barrier_implementation: - printf("%" PRIu64 ":" _TOOL_PREFIX - " ompt_event_barrier_end: parallel_id=%" PRIu64 - ", task_id=%" PRIu64 ", codeptr_ra=%p\n", - ompt_get_thread_data()->value, - (parallel_data) ? parallel_data->value : 0, task_data->value, - codeptr_ra); - break; - case ompt_sync_region_taskwait: - printf("%" PRIu64 ":" _TOOL_PREFIX - " ompt_event_taskwait_end: parallel_id=%" PRIu64 - ", task_id=%" PRIu64 ", codeptr_ra=%p\n", - ompt_get_thread_data()->value, - (parallel_data) ? parallel_data->value : 0, task_data->value, - codeptr_ra); - break; - case ompt_sync_region_taskgroup: - printf("%" PRIu64 ":" _TOOL_PREFIX - " ompt_event_taskgroup_end: parallel_id=%" PRIu64 - ", task_id=%" PRIu64 ", codeptr_ra=%p\n", - ompt_get_thread_data()->value, - (parallel_data) ? parallel_data->value : 0, task_data->value, - codeptr_ra); - break; - case ompt_sync_region_reduction: - printf("ompt_sync_region_reduction should never be passed to " - "on_ompt_callback_sync_region\n"); - exit(-1); - break; - } - break; - case ompt_scope_beginend: - printf("ompt_scope_beginend should never be passed to %s\n", __func__); - exit(-1); + if (endpoint == ompt_scope_beginend) { + printf("ompt_scope_beginend should never be passed to %s\n", __func__); + exit(-1); + } + if (kind == ompt_sync_region_reduction) { + printf("ompt_sync_region_reduction should never be passed to %s\n", + __func__); + exit(-1); + } + uint64_t parallel_data_value = parallel_data ? parallel_data->value : 0; + const char *begin_or_end = (endpoint == ompt_scope_begin) ? "begin" : "end"; + printf("%" PRIu64 ":" _TOOL_PREFIX " ompt_event_%s_%s: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, ompt_sync_region_t_values[kind], + begin_or_end, parallel_data_value, task_data->value, codeptr_ra); + switch (kind) { + case ompt_sync_region_barrier: + case ompt_sync_region_barrier_implicit: + case ompt_sync_region_barrier_implicit_workshare: + case ompt_sync_region_barrier_implicit_parallel: + case ompt_sync_region_barrier_teams: + case ompt_sync_region_barrier_explicit: + case ompt_sync_region_barrier_implementation: + if (endpoint == ompt_scope_begin) + print_ids(0); + default:; } } @@ -600,89 +555,22 @@ on_ompt_callback_sync_region_wait( ompt_data_t *task_data, const void *codeptr_ra) { - switch(endpoint) - { - case ompt_scope_begin: - switch(kind) - { - case ompt_sync_region_barrier: - case ompt_sync_region_barrier_implicit: - case ompt_sync_region_barrier_implicit_workshare: - case ompt_sync_region_barrier_implicit_parallel: - case ompt_sync_region_barrier_teams: - case ompt_sync_region_barrier_explicit: - case ompt_sync_region_barrier_implementation: - printf("%" PRIu64 ":" _TOOL_PREFIX - " ompt_event_wait_barrier_begin: parallel_id=%" PRIu64 - ", task_id=%" PRIu64 ", codeptr_ra=%p\n", - ompt_get_thread_data()->value, parallel_data->value, - task_data->value, codeptr_ra); - break; - case ompt_sync_region_taskwait: - printf("%" PRIu64 ":" _TOOL_PREFIX - " ompt_event_wait_taskwait_begin: parallel_id=%" PRIu64 - ", task_id=%" PRIu64 ", codeptr_ra=%p\n", - ompt_get_thread_data()->value, parallel_data->value, - task_data->value, codeptr_ra); - break; - case ompt_sync_region_taskgroup: - printf("%" PRIu64 ":" _TOOL_PREFIX - " ompt_event_wait_taskgroup_begin: parallel_id=%" PRIu64 - ", task_id=%" PRIu64 ", codeptr_ra=%p\n", - ompt_get_thread_data()->value, parallel_data->value, - task_data->value, codeptr_ra); - break; - case ompt_sync_region_reduction: - printf("ompt_sync_region_reduction should never be passed to " - "on_ompt_callback_sync_region_wait\n"); - exit(-1); - break; - } - break; - case ompt_scope_end: - switch(kind) - { - case ompt_sync_region_barrier: - case ompt_sync_region_barrier_implicit: - case ompt_sync_region_barrier_implicit_workshare: - case ompt_sync_region_barrier_implicit_parallel: - case ompt_sync_region_barrier_teams: - case ompt_sync_region_barrier_explicit: - case ompt_sync_region_barrier_implementation: - printf("%" PRIu64 ":" _TOOL_PREFIX - " ompt_event_wait_barrier_end: parallel_id=%" PRIu64 - ", task_id=%" PRIu64 ", codeptr_ra=%p\n", - ompt_get_thread_data()->value, - (parallel_data) ? parallel_data->value : 0, task_data->value, - codeptr_ra); - break; - case ompt_sync_region_taskwait: - printf("%" PRIu64 ":" _TOOL_PREFIX - " ompt_event_wait_taskwait_end: parallel_id=%" PRIu64 - ", task_id=%" PRIu64 ", codeptr_ra=%p\n", - ompt_get_thread_data()->value, - (parallel_data) ? parallel_data->value : 0, task_data->value, - codeptr_ra); - break; - case ompt_sync_region_taskgroup: - printf("%" PRIu64 ":" _TOOL_PREFIX - " ompt_event_wait_taskgroup_end: parallel_id=%" PRIu64 - ", task_id=%" PRIu64 ", codeptr_ra=%p\n", - ompt_get_thread_data()->value, - (parallel_data) ? parallel_data->value : 0, task_data->value, - codeptr_ra); - break; - case ompt_sync_region_reduction: - printf("ompt_sync_region_reduction should never be passed to " - "on_ompt_callback_sync_region_wait\n"); - exit(-1); - break; - } - break; - case ompt_scope_beginend: - printf("ompt_scope_beginend should never be passed to %s\n", __func__); - exit(-1); + if (endpoint == ompt_scope_beginend) { + printf("ompt_scope_beginend should never be passed to %s\n", __func__); + exit(-1); + } + if (kind == ompt_sync_region_reduction) { + printf("ompt_sync_region_reduction should never be passed to %s\n", + __func__); + exit(-1); } + uint64_t parallel_data_value = parallel_data ? parallel_data->value : 0; + const char *begin_or_end = (endpoint == ompt_scope_begin) ? "begin" : "end"; + printf("%" PRIu64 ":" _TOOL_PREFIX + " ompt_event_wait_%s_%s: parallel_id=%" PRIu64 ", task_id=%" PRIu64 + ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, ompt_sync_region_t_values[kind], + begin_or_end, parallel_data_value, task_data->value, codeptr_ra); } static void on_ompt_callback_reduction(ompt_sync_region_t kind, diff --git a/openmp/runtime/test/ompt/parallel/nested.c b/openmp/runtime/test/ompt/parallel/nested.c index d91597b66da77d..e804eb6fea3cc1 100644 --- a/openmp/runtime/test/ompt/parallel/nested.c +++ b/openmp/runtime/test/ompt/parallel/nested.c @@ -41,7 +41,7 @@ int main() } print_fuzzy_address(3); - + // clang-format off // Check if libomp supports the callbacks for this test. // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' @@ -66,13 +66,13 @@ int main() // Note that we cannot ensure that the worker threads have already called barrier_end and implicit_task_end before parallel_end! // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] - // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] - // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] - // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] @@ -97,24 +97,24 @@ int main() // THREADS: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[NESTED_REENTER:0x[0-f]+]] // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end // explicit barrier - // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], codeptr_ra=[[BARRIER_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_explicit_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], codeptr_ra=[[BARRIER_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_EXIT]], reenter_frame=0x{{[0-f]+}} - // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_explicit_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[BARRIER_RETURN_ADDRESS]] // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_EXIT]], reenter_frame=[[NULL]] // implicit barrier - // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}} + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}} // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]] - // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]], codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}} + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]], codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}} // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]], codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}} // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]] // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] // implicit barrier - // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]] - // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] @@ -128,13 +128,13 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -146,13 +146,13 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -164,13 +164,13 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // nested parallel worker threads @@ -180,8 +180,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -190,8 +190,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -200,8 +200,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -210,8 +210,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -220,8 +220,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -230,8 +230,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -240,8 +240,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -250,8 +250,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -260,8 +260,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -270,8 +270,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -280,8 +280,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -290,9 +290,10 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // clang-format on return 0; } diff --git a/openmp/runtime/test/ompt/parallel/nested_lwt.c b/openmp/runtime/test/ompt/parallel/nested_lwt.c index 83483767bda1fd..ee74bcac2c5926 100644 --- a/openmp/runtime/test/ompt/parallel/nested_lwt.c +++ b/openmp/runtime/test/ompt/parallel/nested_lwt.c @@ -39,6 +39,7 @@ int main() } print_fuzzy_address(3); + // clang-format off // Check if libomp supports the callbacks for this test. // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' @@ -61,13 +62,13 @@ int main() // Note that we cannot ensure that the worker threads have already called barrier_end and implicit_task_end before parallel_end! // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] - // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] - // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] - // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] @@ -93,8 +94,8 @@ int main() // THREADS: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]] // THREADS: {{^}}[[MASTER_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_NESTED_RETURN_ADDRESS]] @@ -103,8 +104,8 @@ int main() // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]] // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] @@ -124,8 +125,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]] // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_NESTED_RETURN_ADDRESS]] @@ -134,8 +135,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -153,8 +154,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]] // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_NESTED_RETURN_ADDRESS]] @@ -163,8 +164,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -182,8 +183,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]] // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_NESTED_RETURN_ADDRESS]] @@ -192,8 +193,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // nested parallel worker threads @@ -204,8 +205,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -215,8 +216,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -226,8 +227,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -237,8 +238,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -248,8 +249,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -259,8 +260,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -270,8 +271,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -281,8 +282,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -292,8 +293,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -303,8 +304,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -314,8 +315,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -325,10 +326,10 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}} // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] - + // clang-format on return 0; } diff --git a/openmp/runtime/test/ompt/parallel/nested_serialized.c b/openmp/runtime/test/ompt/parallel/nested_serialized.c index f87b8f48aa5241..741fdfd3679df7 100644 --- a/openmp/runtime/test/ompt/parallel/nested_serialized.c +++ b/openmp/runtime/test/ompt/parallel/nested_serialized.c @@ -23,6 +23,7 @@ int main() } print_fuzzy_address(2); + // clang-format off // Check if libomp supports the callbacks for this test. // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' @@ -46,13 +47,13 @@ int main() // Note that we cannot ensure that the worker threads have already called barrier_end and implicit_task_end before parallel_end! // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] - // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] - // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] - // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] @@ -72,8 +73,8 @@ int main() // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]] // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] @@ -90,8 +91,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -105,8 +106,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -120,9 +121,10 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // clang-format on return 0; } diff --git a/openmp/runtime/test/ompt/parallel/nested_thread_num.c b/openmp/runtime/test/ompt/parallel/nested_thread_num.c index f14f87ab9a57e7..7ba7077e6f032e 100644 --- a/openmp/runtime/test/ompt/parallel/nested_thread_num.c +++ b/openmp/runtime/test/ompt/parallel/nested_thread_num.c @@ -68,8 +68,7 @@ int main() { // barrier_end and implicit_task_end before parallel_end! // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: -// CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: - +// CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: // CHECK: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], // CHECK-SAME: task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] @@ -141,8 +140,8 @@ int main() { // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end // explicit barrier -// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: -// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_explicit_begin: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], // THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]], // THREADS-SAME: codeptr_ra=[[BARRIER_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} @@ -151,8 +150,8 @@ int main() { // THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]], // THREADS-SAME: exit_frame=[[NESTED_EXIT]], reenter_frame=0x{{[0-f]+}} -// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: -// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_explicit_end: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], // THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[BARRIER_RETURN_ADDRESS]] @@ -163,8 +162,8 @@ int main() { // THREADS-SAME: exit_frame=[[NESTED_EXIT]], reenter_frame=[[NULL]] // implicit barrier -// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: -// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_begin: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], // THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]], // THREADS-SAME: codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}} @@ -173,7 +172,7 @@ int main() { // THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]], // THREADS-SAME: exit_frame=[[NULL]], reenter_frame=[[NULL]] -// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: +// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end: // THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]], // THREADS-SAME: codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}} @@ -195,7 +194,7 @@ int main() { // THREADS-SAME: reenter_frame=[[NULL]] // implicit barrier -// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: +// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_begin: // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], // THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} @@ -203,7 +202,7 @@ int main() { // THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], // THREADS-SAME: reenter_frame=[[NULL]] -// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: +// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end: // THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]], // THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} @@ -260,11 +259,11 @@ int main() { // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end -// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: -// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], // THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]] -// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: // THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: @@ -276,10 +275,10 @@ int main() { // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end -// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] -// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: // THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: @@ -310,11 +309,11 @@ int main() { // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end -// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: -// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], // THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]] -// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: // THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: @@ -345,11 +344,11 @@ int main() { // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end -// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: -// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: +// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], // THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]] -// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: +// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: // THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: diff --git a/openmp/runtime/test/ompt/parallel/no_thread_num_clause.c b/openmp/runtime/test/ompt/parallel/no_thread_num_clause.c index 5583036ec9229e..41e16428bf71b7 100644 --- a/openmp/runtime/test/ompt/parallel/no_thread_num_clause.c +++ b/openmp/runtime/test/ompt/parallel/no_thread_num_clause.c @@ -13,6 +13,7 @@ int main() } print_fuzzy_address(1); + // clang-format off // Check if libomp supports the callbacks for this test. // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin' // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_end' @@ -38,13 +39,13 @@ int main() // Note that we cannot ensure that the worker threads have already called barrier_end and implicit_task_end before parallel_end! // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] - // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] - // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] - // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]] @@ -59,8 +60,8 @@ int main() // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] @@ -70,8 +71,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]] @@ -79,8 +80,8 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]] @@ -88,9 +89,10 @@ int main() // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // clang-format on return 0; } diff --git a/openmp/runtime/test/ompt/parallel/normal.c b/openmp/runtime/test/ompt/parallel/normal.c index 011bfacb130f35..85518f40c628bd 100644 --- a/openmp/runtime/test/ompt/parallel/normal.c +++ b/openmp/runtime/test/ompt/parallel/normal.c @@ -32,13 +32,13 @@ int main() { // barrier_end and implicit_task_end before parallel_end! // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin - // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin - // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin - // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end @@ -61,13 +61,13 @@ int main() { // THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]] // THREADS-SAME: task_id=[[PARENT_TASK_ID]] // THREADS-NOT: ompt_event_implicit_task_end - // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_begin // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} // THREADS: {{^}}[[MASTER_ID]]: task level 0 // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // THREADS-SAME: exit_frame=[[NULL]], reenter_frame=[[NULL]] - // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end // parallel_id is 0 because the region ended in the barrier! // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] // THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} @@ -85,9 +85,9 @@ int main() { // THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]] // THREADS-SAME: task_id=[[PARENT_TASK_ID]] // THREADS-NOT: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end // parallel_id is 0 because the region ended in the barrier! // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end @@ -104,9 +104,9 @@ int main() { // THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]] // THREADS-SAME: task_id=[[PARENT_TASK_ID]] // THREADS-NOT: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end // parallel_id is 0 because the region ended in the barrier! // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end @@ -123,9 +123,9 @@ int main() { // THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]] // THREADS-SAME: task_id=[[PARENT_TASK_ID]] // THREADS-NOT: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end // parallel_id is 0 because the region ended in the barrier! // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end diff --git a/openmp/runtime/test/ompt/parallel/not_enough_threads.c b/openmp/runtime/test/ompt/parallel/not_enough_threads.c index 8a0469af1912c8..39c80e425471e6 100644 --- a/openmp/runtime/test/ompt/parallel/not_enough_threads.c +++ b/openmp/runtime/test/ompt/parallel/not_enough_threads.c @@ -32,7 +32,7 @@ int main() { // barrier_end and implicit_task_end before parallel_end! // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin - // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin + // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end @@ -56,10 +56,10 @@ int main() { // THREADS-SAME: task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_begin // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // parallel_id is 0 because the region ended in the barrier! - // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end + // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] @@ -78,10 +78,10 @@ int main() { // THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]] // THREADS-SAME: task_id=[[PARENT_TASK_ID]] // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // parallel_id is 0 because the region ended in the barrier! - // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end + // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] diff --git a/openmp/runtime/test/ompt/synchronization/barrier/explicit.c b/openmp/runtime/test/ompt/synchronization/barrier/explicit.c index d60acd62311ea0..7c47d935534d74 100644 --- a/openmp/runtime/test/ompt/synchronization/barrier/explicit.c +++ b/openmp/runtime/test/ompt/synchronization/barrier/explicit.c @@ -20,39 +20,40 @@ int main() x++; } - + // clang-format off // Check if libomp supports the callbacks for this test. // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region' // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait' // CHECK: 0: NULL_POINTER=[[NULL:.*$]] - // master thread explicit barrier - // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] + // master thread explicit barrier + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_explicit_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_explicit_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_explicit_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_explicit_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] // CHECK: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]] // master thread implicit barrier at parallel end - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}} - // worker thread explicit barrier - // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] + // worker thread explicit barrier + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_explicit_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_explicit_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_explicit_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_explicit_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]] // CHECK: {{^}}[[THREAD_ID]]: current_address={{.*}}[[RETURN_ADDRESS]] // worker thread implicit barrier at parallel end - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // clang-format on return 0; } diff --git a/openmp/runtime/test/ompt/synchronization/barrier/for_loop.c b/openmp/runtime/test/ompt/synchronization/barrier/for_loop.c index 52594478e1f04e..8181aa64d1a919 100644 --- a/openmp/runtime/test/ompt/synchronization/barrier/for_loop.c +++ b/openmp/runtime/test/ompt/synchronization/barrier/for_loop.c @@ -20,37 +20,37 @@ int main() print_current_address(); } - + // clang-format off // Check if libomp supports the callbacks for this test. // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region' // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait' // CHECK: 0: NULL_POINTER=[[NULL:.*$]] - // master thread implicit barrier at loop end - // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // master thread implicit barrier at loop end + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_implicit_workshare_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_workshare_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_workshare_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_workshare_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} // master thread implicit barrier at parallel end - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // worker thread explicit barrier - // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // worker thread implicit barrier at loop end + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_implicit_workshare_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_implicit_workshare_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_implicit_workshare_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_workshare_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} // worker thread implicit barrier after parallel - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // clang-format on return 0; } diff --git a/openmp/runtime/test/ompt/synchronization/barrier/for_simd.c b/openmp/runtime/test/ompt/synchronization/barrier/for_simd.c index 351b2c23b8cc8f..11df36c265e2a9 100644 --- a/openmp/runtime/test/ompt/synchronization/barrier/for_simd.c +++ b/openmp/runtime/test/ompt/synchronization/barrier/for_simd.c @@ -16,18 +16,19 @@ int main() y[i]++; } - + // clang-format off // Check if libomp supports the callbacks for this test. // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region' // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait' // CHECK: 0: NULL_POINTER=[[NULL:.*$]] - // master thread implicit barrier at simd loop end - // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // master thread implicit barrier at simd loop end + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_implicit_workshare_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_workshare_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_workshare_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_workshare_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // clang-format on return 0; } diff --git a/openmp/runtime/test/ompt/synchronization/barrier/implicit_task_data.c b/openmp/runtime/test/ompt/synchronization/barrier/implicit_task_data.c index 7ac3e9099c8ee2..da7c75627569fe 100644 --- a/openmp/runtime/test/ompt/synchronization/barrier/implicit_task_data.c +++ b/openmp/runtime/test/ompt/synchronization/barrier/implicit_task_data.c @@ -34,7 +34,7 @@ int main() } } - + // clang-format off // Check if libomp supports the callbacks for this test. // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region' // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait' @@ -42,17 +42,18 @@ int main() // CHECK: 0: NULL_POINTER=[[NULL:.*$]] // master thread implicit barrier at parallel end - // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id=0, task_id=[[TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra={{0x[0-f]*}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra={{0x[0-f]*}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra={{0x[0-f]*}} + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=0, task_id=[[TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_parallel_begin: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra={{0x[0-f]*}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_parallel_end: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra={{0x[0-f]*}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra={{0x[0-f]*}} // worker thread implicit barrier at parallel end - // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id=0, task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=0, task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_implicit_parallel_begin: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_implicit_parallel_end: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra=[[NULL]] + // clang-format on return 0; } @@ -76,21 +77,26 @@ on_ompt_callback_sync_region( ompt_data_t *task_data, const void *codeptr_ra) { - switch(endpoint) - { - case ompt_scope_begin: - task_data->value = ompt_get_unique_id(); - if (kind == ompt_sync_region_barrier_implicit) - printf("%" PRIu64 ": ompt_event_barrier_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); - break; - case ompt_scope_end: - if (kind == ompt_sync_region_barrier_implicit) - printf("%" PRIu64 ": ompt_event_barrier_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); - break; - case ompt_scope_beginend: - printf("ompt_scope_beginend should never be passed to %s\n", __func__); - exit(-1); + // We only expect implicit parallel barrier in this code. + if (kind != ompt_sync_region_barrier_implicit_parallel) { + printf("unexpected ompt_sync_region_t passed to %s\n", __func__); + exit(-1); + } + const char *event_name = NULL; + if (endpoint == ompt_scope_begin) { + event_name = "ompt_event_barrier_implicit_parallel_begin"; + task_data->value = ompt_get_unique_id(); + } else if (endpoint == ompt_scope_end) { + event_name = "ompt_event_barrier_implicit_parallel_end"; + } else { + printf("ompt_scope_beginend should never be passed to %s\n", __func__); + exit(-1); } + printf("%" PRIu64 ": %s: parallel_id=%" PRIu64 ", task_id=%" PRIu64 + ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, event_name, + parallel_data ? parallel_data->value : 0, task_data->value, + codeptr_ra); } static void @@ -101,24 +107,24 @@ on_ompt_callback_sync_region_wait( ompt_data_t *task_data, const void *codeptr_ra) { - switch(endpoint) - { - case ompt_scope_begin: - if (kind == ompt_sync_region_barrier_implicit) - printf("%" PRIu64 - ": ompt_event_wait_barrier_begin: parallel_id=%" PRIu64 - ", task_id=%" PRIu64 ", codeptr_ra=%p\n", - ompt_get_thread_data()->value, parallel_data->value, - task_data->value, codeptr_ra); - break; - case ompt_scope_end: - if (kind == ompt_sync_region_barrier_implicit) - printf("%" PRIu64 ": ompt_event_wait_barrier_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); - break; - case ompt_scope_beginend: - printf("ompt_scope_beginend should never be passed to %s\n", __func__); - exit(-1); + if (kind != ompt_sync_region_barrier_implicit_parallel) { + printf("unexpected ompt_sync_region_t passed to %s\n", __func__); + exit(-1); + } + const char *event_name = NULL; + if (endpoint == ompt_scope_begin) { + event_name = "ompt_event_wait_barrier_implicit_parallel_begin"; + } else if (endpoint == ompt_scope_end) { + event_name = "ompt_event_wait_barrier_implicit_parallel_end"; + } else { + printf("ompt_scope_beginend should never be passed to %s\n", __func__); + exit(-1); } + printf("%" PRIu64 ": %s: parallel_id=%" PRIu64 ", task_id=%" PRIu64 + ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, event_name, + parallel_data ? parallel_data->value : 0, task_data->value, + codeptr_ra); } #define register_ompt_callback_t(name, type) \ diff --git a/openmp/runtime/test/ompt/synchronization/barrier/parallel_region.c b/openmp/runtime/test/ompt/synchronization/barrier/parallel_region.c index ea0a23f1ed5860..4cc6ad553e13dd 100644 --- a/openmp/runtime/test/ompt/synchronization/barrier/parallel_region.c +++ b/openmp/runtime/test/ompt/synchronization/barrier/parallel_region.c @@ -15,7 +15,7 @@ int main() } print_fuzzy_address(); - + // clang-format off // Check if libomp supports the callbacks for this test. // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region' // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait' @@ -23,18 +23,19 @@ int main() // CHECK: 0: NULL_POINTER=[[NULL:.*$]] // master thread implicit barrier at parallel end - // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] // worker thread implicit barrier at parallel end - // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // clang-format on return 0; } diff --git a/openmp/runtime/test/ompt/synchronization/barrier/sections.c b/openmp/runtime/test/ompt/synchronization/barrier/sections.c index 4e1dfdd301c0a4..fcae6023d4aeeb 100644 --- a/openmp/runtime/test/ompt/synchronization/barrier/sections.c +++ b/openmp/runtime/test/ompt/synchronization/barrier/sections.c @@ -27,7 +27,7 @@ int main() } } - + // clang-format off // Check if libomp supports the callbacks for this test. // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region' // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait' @@ -35,29 +35,30 @@ int main() // CHECK: 0: NULL_POINTER=[[NULL:.*$]] // master thread implicit barrier at sections end - // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_implicit_workshare_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_workshare_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_workshare_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_workshare_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} // master thread implicit barrier at parallel end - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} // worker thread implicit barrier at sections end - // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_implicit_workshare_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_implicit_workshare_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_implicit_workshare_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_workshare_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} // worker thread implicit barrier at parallel end - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // clang-format on return 0; } diff --git a/openmp/runtime/test/ompt/synchronization/barrier/single.c b/openmp/runtime/test/ompt/synchronization/barrier/single.c index 8ba8b5211b1f24..fa1b21f5504362 100644 --- a/openmp/runtime/test/ompt/synchronization/barrier/single.c +++ b/openmp/runtime/test/ompt/synchronization/barrier/single.c @@ -23,7 +23,7 @@ int main() } } - + // clang-format off // Check if libomp supports the callbacks for this test. // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region' // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait' @@ -31,31 +31,32 @@ int main() // CHECK: 0: NULL_POINTER=[[NULL:.*$]] // master thread implicit barrier at single end - // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_implicit_workshare_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_workshare_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_workshare_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_workshare_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] // master thread implicit barrier at parallel end - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}} // worker thread implicit barrier at single end - // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_implicit_workshare_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_implicit_workshare_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_implicit_workshare_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_workshare_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}} // CHECK: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] // worker thread implicit barrier at parallel end - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_implicit_parallel_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]] + // clang-format on return 0; } diff --git a/openmp/runtime/test/ompt/tasks/explicit_task.c b/openmp/runtime/test/ompt/tasks/explicit_task.c index a986c48ee895ae..8107a8f6f9b284 100644 --- a/openmp/runtime/test/ompt/tasks/explicit_task.c +++ b/openmp/runtime/test/ompt/tasks/explicit_task.c @@ -35,7 +35,7 @@ int main() print_ids(0); } - + // clang-format off // Check if libomp supports the callbacks for this test. // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule' @@ -66,13 +66,13 @@ int main() // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]] // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] // explicit barrier after master - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_explicit_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=0x{{[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_explicit_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // implicit barrier parallel - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]] - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -80,7 +80,7 @@ int main() // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}} // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_explicit_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=0x{{[0-f]+}} // this is expected to come earlier and at MASTER: // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[IMPLICIT_TASK_ID]], second_task_id=[[TASK_ID]] @@ -90,13 +90,12 @@ int main() // CHECK: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}} // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[TASK_ID]], second_task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_end: task_id=[[TASK_ID]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_explicit_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] - - + // clang-format on return 0; } diff --git a/openmp/runtime/test/ompt/tasks/serialized.c b/openmp/runtime/test/ompt/tasks/serialized.c index 1ce0b17a395ce0..7785d793f55847 100644 --- a/openmp/runtime/test/ompt/tasks/serialized.c +++ b/openmp/runtime/test/ompt/tasks/serialized.c @@ -114,12 +114,12 @@ int main() { // CHECK-SAME: exit_frame=[[EXIT]], reenter_frame=[[NULL]] // implicit barrier parallel - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_begin // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[MASTER_ID]]: task level 0 // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK-SAME: exit_frame=[[NULL]], reenter_frame=[[NULL]] - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end // parallel_id is 0 because the region ended in the barrier! // CHECK-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end @@ -139,13 +139,13 @@ int main() { // CHECK-SAME: exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}} // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(0)={{0x[0-f]+}} - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[THREAD_ID]]: task level 0 // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK-SAME: exit_frame=[[NULL]], reenter_frame=[[NULL]] // parallel_id is 0 because the region ended in the barrier! - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end // CHECK-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end diff --git a/openmp/runtime/test/ompt/tasks/task_in_joinbarrier.c b/openmp/runtime/test/ompt/tasks/task_in_joinbarrier.c index 8228add78f3eb2..bf148ef229ff78 100644 --- a/openmp/runtime/test/ompt/tasks/task_in_joinbarrier.c +++ b/openmp/runtime/test/ompt/tasks/task_in_joinbarrier.c @@ -32,7 +32,7 @@ int main() print_ids(0); } - + // clang-format off // Check if libomp supports the callbacks for this test. // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule' @@ -62,9 +62,9 @@ int main() // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter=0x{{[0-f]+}}, new_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[TASK_FUNCTION:0x[0-f]+]] // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] // implicit barrier parallel - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]] - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -73,7 +73,7 @@ int main() // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}} // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]] // implicit barrier parallel - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]] // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[IMPLICIT_TASK_ID]], second_task_id=[[TASK_ID]] // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(1)=[[TASK_EXIT:0x[0-f]+]] @@ -82,10 +82,9 @@ int main() // CHECK: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}} // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[TASK_ID]], second_task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_end: task_id=[[TASK_ID]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] - - + // clang-format on return 0; } diff --git a/openmp/runtime/test/ompt/tasks/untied_task.c b/openmp/runtime/test/ompt/tasks/untied_task.c index 4ee3f110bf6e83..d908a354ca4116 100644 --- a/openmp/runtime/test/ompt/tasks/untied_task.c +++ b/openmp/runtime/test/ompt/tasks/untied_task.c @@ -42,7 +42,7 @@ int main() print_ids(0); } - + // clang-format off // Check if libomp supports the callbacks for this test. // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create' // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule' @@ -72,13 +72,13 @@ int main() // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter=0x{{[0-f]+}}, new_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[TASK_FUNCTION:0x[0-f]+]] // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] // explicit barrier after master - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_explicit_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=0x{{[0-f]+}} - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_explicit_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // implicit barrier parallel - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]] - // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]] @@ -86,7 +86,7 @@ int main() // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]] // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}} // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_explicit_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=0x{{[0-f]+}} // this is expected to come earlier and at MASTER: // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[IMPLICIT_TASK_ID]], second_task_id=[[TASK_ID]] @@ -96,13 +96,12 @@ int main() // CHECK: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}} // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[TASK_ID]], second_task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_end: task_id=[[TASK_ID]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_explicit_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]] - // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] + // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_implicit_parallel_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]] - - + // clang-format on return 0; } diff --git a/openmp/tools/archer/tests/lit.cfg b/openmp/tools/archer/tests/lit.cfg index 692cbfe97cf1e1..f8fbcad752a4c6 100644 --- a/openmp/tools/archer/tests/lit.cfg +++ b/openmp/tools/archer/tests/lit.cfg @@ -83,7 +83,7 @@ if config.operating_system == 'Darwin': if 'Linux' in config.operating_system: config.available_features.add("linux") -if config.has_tsan == True: +if config.has_tsan: config.available_features.add("tsan") # to run with icc INTEL_LICENSE_FILE must be set diff --git a/openmp/tools/multiplex/tests/custom_data_storage/custom_data_storage.c b/openmp/tools/multiplex/tests/custom_data_storage/custom_data_storage.c index 96d9a758429f3d..1a260f5bcd3dfe 100644 --- a/openmp/tools/multiplex/tests/custom_data_storage/custom_data_storage.c +++ b/openmp/tools/multiplex/tests/custom_data_storage/custom_data_storage.c @@ -82,12 +82,14 @@ int main() { // CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra={{0x[0-f]+}} -// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_barrier_begin: +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: +// CHECK-SAME: ompt_event_barrier_implicit_parallel_begin: // CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], // CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra={{0x[0-f]+}} -// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_wait_barrier_begin: +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: +// CHECK-SAME: ompt_event_wait_barrier_implicit_parallel_begin: // CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], // CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra={{0x[0-f]+}} @@ -125,11 +127,13 @@ int main() { // CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_task_end: // CHECK-SAME: task_id=[[_FIRST_EXPLICIT_TASK_ID]] -// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_wait_barrier_end: +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: +// CHECK-SAME: ompt_event_wait_barrier_implicit_parallel_end: // CHECK-SAME: parallel_id=0, // CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) -// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_barrier_end: +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: +// CHECK-SAME: ompt_event_barrier_implicit_parallel_end: // CHECK-SAME: parallel_id=0, // CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) @@ -184,12 +188,14 @@ int main() { // CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra={{0x[0-f]+}} -// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_barrier_begin: +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: +// CHECK-SAME: ompt_event_barrier_implicit_parallel_begin: // CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], // CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra={{0x[0-f]+}} -// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_wait_barrier_begin: +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: +// CHECK-SAME: ompt_event_wait_barrier_implicit_parallel_begin: // CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], // CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra={{0x[0-f]+}} @@ -227,11 +233,13 @@ int main() { // CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_task_end: // CHECK-SAME: task_id=[[SECOND_EXPLICIT_TASK_ID]] -// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_wait_barrier_end: +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: +// CHECK-SAME: ompt_event_wait_barrier_implicit_parallel_end: // CHECK-SAME: parallel_id=0, // CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) -// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_barrier_end: +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: +// CHECK-SAME: ompt_event_barrier_implicit_parallel_end: // CHECK-SAME: parallel_id=0, // CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) @@ -257,19 +265,23 @@ int main() { // CHECK-SAME: task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID:[0-9]+]], team_size=2, // CHECK-SAME: thread_num=1 -// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_barrier_begin: +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: +// CHECK-SAME: ompt_event_barrier_implicit_parallel_begin: // CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], // CHECK-SAME: task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) -// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_wait_barrier_begin: +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: +// CHECK-SAME: ompt_event_wait_barrier_implicit_parallel_begin: // CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], // CHECK-SAME: task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) -// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_wait_barrier_end: +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: +// CHECK-SAME: ompt_event_wait_barrier_implicit_parallel_end: // CHECK-SAME: parallel_id=0, // CHECK-SAME: task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) -// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_barrier_end: +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: +// CHECK-SAME: ompt_event_barrier_implicit_parallel_end: // CHECK-SAME: parallel_id=0, // CHECK-SAME: task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) @@ -290,19 +302,23 @@ int main() { // CHECK-SAME: task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID:[0-9]+]], team_size=2, // CHECK-SAME: thread_num=1 -// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_barrier_begin: +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: +// CHECK-SAME: ompt_event_barrier_implicit_parallel_begin: // CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], // CHECK-SAME: task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) -// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_wait_barrier_begin: +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: +// CHECK-SAME: ompt_event_wait_barrier_implicit_parallel_begin: // CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], // CHECK-SAME: task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) -// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_wait_barrier_end: +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: +// CHECK-SAME: ompt_event_wait_barrier_implicit_parallel_end: // CHECK-SAME: parallel_id=0, // CHECK-SAME: task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) -// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_barrier_end: +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: +// CHECK-SAME: ompt_event_barrier_implicit_parallel_end: // CHECK-SAME: parallel_id=0, // CHECK-SAME: task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) diff --git a/openmp/tools/multiplex/tests/print/print.c b/openmp/tools/multiplex/tests/print/print.c index c492899c69f8ca..a3ec1f5a73a509 100644 --- a/openmp/tools/multiplex/tests/print/print.c +++ b/openmp/tools/multiplex/tests/print/print.c @@ -83,12 +83,14 @@ int main() { // CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra={{0x[0-f]+}} -// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_barrier_begin: +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: +// CHECK-SAME: ompt_event_barrier_implicit_parallel_begin: // CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], // CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra={{0x[0-f]+}} -// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_wait_barrier_begin: +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: +// CHECK-SAME: ompt_event_wait_barrier_implicit_parallel_begin: // CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], // CHECK-SAME: task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra={{0x[0-f]+}} @@ -124,11 +126,13 @@ int main() { // CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_task_end: // CHECK-SAME: task_id=[[_FIRST_EXPLICIT_TASK_ID]] -// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_wait_barrier_end: +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: +// CHECK-SAME: ompt_event_wait_barrier_implicit_parallel_end: // CHECK-SAME: parallel_id=0, task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra=(nil) -// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: ompt_event_barrier_end: +// CHECK: {{^}}[[_1ST_MSTR_TID]]: _first_tool: +// CHECK-SAME: ompt_event_barrier_implicit_parallel_end: // CHECK-SAME: parallel_id=0, task_id=[[_FIRST_MASTER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra=(nil) @@ -181,12 +185,14 @@ int main() { // CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra={{0x[0-f]+}} -// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_barrier_begin: +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: +// CHECK-SAME: ompt_event_barrier_implicit_parallel_begin: // CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], // CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra={{0x[0-f]+}} -// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_wait_barrier_begin: +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: +// CHECK-SAME: ompt_event_wait_barrier_implicit_parallel_begin: // CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], // CHECK-SAME: task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra={{0x[0-f]+}} @@ -222,11 +228,13 @@ int main() { // CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_task_end: // CHECK-SAME: task_id=[[SECOND_EXPLICIT_TASK_ID]] -// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_wait_barrier_end: +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: +// CHECK-SAME: ompt_event_wait_barrier_implicit_parallel_end: // CHECK-SAME: parallel_id=0, task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra=(nil) -// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: ompt_event_barrier_end: +// CHECK: {{^}}[[_2ND_MSTR_TID]]: second_tool: +// CHECK-SAME: ompt_event_barrier_implicit_parallel_end: // CHECK-SAME: parallel_id=0, task_id=[[SECOND_MASTER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra=(nil) @@ -250,19 +258,23 @@ int main() { // CHECK-SAME: task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID:[0-9]+]], team_size=2, // CHECK-SAME: thread_num=1 -// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_barrier_begin: +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: +// CHECK-SAME: ompt_event_barrier_implicit_parallel_begin: // CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], // CHECK-SAME: task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) -// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_wait_barrier_begin: +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: +// CHECK-SAME: ompt_event_wait_barrier_implicit_parallel_begin: // CHECK-SAME: parallel_id=[[_FIRST_PARALLEL_ID]], // CHECK-SAME: task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) -// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_wait_barrier_end: +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: +// CHECK-SAME: ompt_event_wait_barrier_implicit_parallel_end: // CHECK-SAME: parallel_id=0, task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra=(nil) -// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: ompt_event_barrier_end: +// CHECK: {{^}}[[_1ST_WRKR_TID]]: _first_tool: +// CHECK-SAME: ompt_event_barrier_implicit_parallel_end: // CHECK-SAME: parallel_id=0, task_id=[[_FIRST_WORKER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra=(nil) @@ -282,19 +294,23 @@ int main() { // CHECK-SAME: task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID:[0-9]+]], // CHECK-SAME: team_size=2, thread_num=1 -// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_barrier_begin: +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: +// CHECK-SAME: ompt_event_barrier_implicit_parallel_begin: // CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], // CHECK-SAME: task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) -// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_wait_barrier_begin: +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: +// CHECK-SAME: ompt_event_wait_barrier_implicit_parallel_begin: // CHECK-SAME: parallel_id=[[SECOND_PARALLEL_ID]], // CHECK-SAME: task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID]], codeptr_ra=(nil) -// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_wait_barrier_end: +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: +// CHECK-SAME: ompt_event_wait_barrier_implicit_parallel_end: // CHECK-SAME: parallel_id=0, task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra=(nil) -// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: ompt_event_barrier_end: +// CHECK: {{^}}[[_2ND_WRKR_TID]]: second_tool: +// CHECK-SAME: ompt_event_barrier_implicit_parallel_end: // CHECK-SAME: parallel_id=0, task_id=[[SECOND_WORKER_IMPLICIT_TASK_ID]], // CHECK-SAME: codeptr_ra=(nil) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 919eea61757078..1687321b521e68 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -254,6 +254,7 @@ libc_support_library( hdrs = ["src/__support/macros/optimization.h"], deps = [ ":__support_macros_attributes", + ":__support_macros_config", ":__support_macros_properties_compiler", ], ) @@ -261,6 +262,9 @@ libc_support_library( libc_support_library( name = "__support_macros_sanitizer", hdrs = ["src/__support/macros/sanitizer.h"], + deps = [ + ":__support_macros_config", + ], ) libc_support_library( @@ -271,6 +275,7 @@ libc_support_library( ], deps = [ ":__support_macros_attributes", + ":__support_macros_config", ":__support_macros_properties_architectures", ], ) @@ -280,6 +285,7 @@ libc_support_library( hdrs = ["src/__support/CPP/algorithm.h"], deps = [ ":__support_macros_attributes", + ":__support_macros_config", ], ) @@ -317,6 +323,7 @@ libc_support_library( hdrs = ["src/__support/CPP/bitset.h"], deps = [ ":__support_macros_attributes", + ":__support_macros_config", ], ) @@ -334,6 +341,7 @@ libc_support_library( hdrs = ["src/__support/CPP/expected.h"], deps = [ ":__support_macros_attributes", + ":__support_macros_config", ], ) @@ -424,6 +432,7 @@ libc_support_library( ], deps = [ ":__support_macros_attributes", + ":__support_macros_config", ":__support_macros_properties_types", ":llvm_libc_macros_stdfix_macros", ], @@ -573,7 +582,10 @@ libc_support_library( libc_support_library( name = "__support_str_to_num_result", hdrs = ["src/__support/str_to_num_result.h"], - deps = [":__support_macros_attributes"], + deps = [ + ":__support_macros_attributes", + ":__support_macros_config", + ], ) libc_support_library( @@ -612,7 +624,10 @@ libc_support_library( libc_support_library( name = "__support_ctype_utils", hdrs = ["src/__support/ctype_utils.h"], - deps = [":__support_macros_attributes"], + deps = [ + ":__support_macros_attributes", + ":__support_macros_config", + ], ) libc_support_library( @@ -785,6 +800,7 @@ libc_support_library( hdrs = ["src/__support/FPUtil/rounding_mode.h"], deps = [ ":__support_macros_attributes", + ":__support_macros_config", ":hdr_fenv_macros", ], ) @@ -1126,6 +1142,7 @@ libc_support_library( hdrs = ["src/__support/threads/sleep.h"], deps = [ ":__support_macros_attributes", + ":__support_macros_config", ], ) @@ -2396,9 +2413,15 @@ libc_function( libc_support_library( name = "qsort_util", - hdrs = ["src/stdlib/qsort_util.h"], + hdrs = [ + "src/stdlib/heap_sort.h", + "src/stdlib/qsort_data.h", + "src/stdlib/qsort_util.h", + "src/stdlib/quick_sort.h", + ], deps = [ ":__support_common", + ":__support_cpp_cstddef", ":__support_macros_attributes", ], ) @@ -3408,9 +3431,9 @@ libc_support_library( ":__support_arg_list", ":__support_file_file", ":__support_macros_attributes", - ":types_FILE", ":printf_main", ":printf_writer", + ":types_FILE", ], ) diff --git a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl index cc732effb243e1..ec3714407cb914 100644 --- a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl +++ b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl @@ -43,7 +43,7 @@ def _libc_library(name, hidden, copts = [], deps = [], local_defines = [], **kwa name = name, copts = copts + libc_common_copts(), local_defines = local_defines + LIBC_CONFIGURE_OPTIONS, - deps = deps + ["//libc:__support_macros_config"], + deps = deps, linkstatic = 1, **kwargs ) diff --git a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel index 6126a4a8fca830..3e130cd9dc6f08 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel @@ -18,6 +18,7 @@ libc_support_library( "//libc:__support_big_int", "//libc:__support_cpp_string", "//libc:__support_cpp_string_view", + "//libc:__support_macros_config", "//libc:__support_macros_properties_types", "//libc:__support_osutil_io", "//libc:__support_uint128", @@ -52,6 +53,7 @@ libc_support_library( "//libc:__support_fputil_fp_bits", "//libc:__support_fputil_fpbits_str", "//libc:__support_fputil_rounding_mode", + "//libc:__support_macros_config", "//libc:__support_macros_properties_architectures", "//libc:__support_macros_properties_types", "//libc:__support_stringutil", @@ -89,10 +91,11 @@ libc_support_library( "//libc:__support_fputil_fp_bits", "//libc:__support_fputil_fpbits_str", "//libc:__support_fputil_rounding_mode", + "//libc:__support_macros_config", "//libc:__support_macros_properties_architectures", + "//libc:hdr_fenv_macros", "//libc:hdr_math_macros", - "//libc:hdr_fenv_macros", - "//libc:types_fenv_t", + "//libc:types_fenv_t", ], ) @@ -110,6 +113,7 @@ libc_support_library( "//libc:__support_cpp_bitset", "//libc:__support_cpp_span", "//libc:__support_cpp_type_traits", + "//libc:__support_macros_config", ], ) @@ -125,6 +129,7 @@ libc_support_library( ":LibcUnitTest", ":string_utils", "//libc:__support_fputil_fp_bits", + "//libc:__support_macros_config", "//libc:printf_core_structs", ], ) @@ -138,5 +143,6 @@ libc_support_library( "//libc:__support_big_int", "//libc:__support_cpp_string", "//libc:__support_cpp_type_traits", + "//libc:__support_macros_config", ], ) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel index 2940326d5bfc31..57e3f9f6e9458f 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel @@ -298,8 +298,8 @@ libc_support_library( "//libc:__support_fputil_fp_bits", "//libc:__support_fputil_manipulation_functions", "//libc:hdr_math_macros", - "//libc/test/UnitTest:fp_test_helpers", "//libc/test/UnitTest:LibcUnitTest", + "//libc/test/UnitTest:fp_test_helpers", ], ) @@ -559,7 +559,10 @@ math_test( libc_support_library( name = "sdcomp26094", hdrs = ["sdcomp26094.h"], - deps = ["//libc:__support_cpp_array"], + deps = [ + "//libc:__support_cpp_array", + "//libc:__support_macros_config", + ], ) math_test( diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel index b34e281ce0ecd6..a6f9d4f2fdac2a 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel @@ -100,10 +100,35 @@ libc_test( libc_function_deps = ["//libc:bsearch"], ) +libc_support_library( + name = "qsort_test_helper", + hdrs = ["SortingTest.h"], + deps = [ + "//libc:qsort_util", + "//libc/test/UnitTest:LibcUnitTest", + ], +) libc_test( name = "qsort_test", srcs = ["qsort_test.cpp"], libc_function_deps = ["//libc:qsort"], + deps = [":qsort_test_helper"], +) +libc_test( + name = "quick_sort_test", + srcs = ["quick_sort_test.cpp"], + deps = [ + ":qsort_test_helper", + "//libc:qsort_util", + ], +) +libc_test( + name = "heap_sort_test", + srcs = ["heap_sort_test.cpp"], + deps = [ + ":qsort_test_helper", + "//libc:qsort_util", + ], ) libc_test( diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/string/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/string/BUILD.bazel index fb0046e9f89d19..b11bf163473be1 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/string/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/string/BUILD.bazel @@ -121,6 +121,7 @@ libc_support_library( deps = [ "//libc:__support_cpp_span", "//libc:__support_libc_assert", + "//libc:__support_macros_config", "//libc:__support_macros_sanitizer", "//libc:string_memory_utils", ], diff --git a/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel index c708f008dec2c7..adf4b235b1b5e3 100644 --- a/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel @@ -48,6 +48,7 @@ libc_support_library( "//libc:__support_cpp_type_traits", "//libc:__support_fputil_fp_bits", "//libc:__support_fputil_fpbits_str", + "//libc:__support_macros_config", "//libc:__support_macros_properties_types", "//libc:hdr_math_macros", "//libc/test/UnitTest:LibcUnitTest", diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel index ac924dec91532c..cc573864e29b1e 100644 --- a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel @@ -697,6 +697,7 @@ cc_library( ":TargetHeaders", ":TargetProperties", ":Utility", + "//clang:codegen", "//lldb/source/Plugins:PluginProcessUtility", "//llvm:BinaryFormat", "//llvm:MC", diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 4d443e809d55bd..ae17746c72882a 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -944,7 +944,17 @@ cc_library( srcs = glob([ "lib/IR/*.cpp", "lib/IR/*.h", - ]), + ]) + [ + # To avoid a dependency cycle. + "include/llvm/Analysis/IVDescriptors.h", + "include/llvm/CodeGen/GenVT.inc", + ] + glob( + # To avoid a dependency cycle. + [ + "include/llvm/CodeGen/**/*.h", + "include/llvm/CodeGenTypes/**/*.h", + ], + ), hdrs = glob( [ "include/llvm/*.h", diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index ab3757342c76f5..af542bba57d5f8 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -2900,6 +2900,7 @@ cc_library( ":IR", ":LoopLikeInterface", ":SCFDialect", + ":SCFToControlFlow", ":SCFTransformOpsIncGen", ":SCFTransforms", ":SCFUtils",